setwd("C:\\Users\\harsh\\Desktop\\data\\CCPP")
CCPP<- read.csv("data.csv")
library(ggplot2, quietly = TRUE)
library(knitr, quietly = TRUE)
dim(CCPP)
## [1] 9568 5
Features consist of hourly average ambient variables
1. Temperature (T) in the range 1.81°C and 37.11°C,
2. Ambient Pressure (AP) in the range 992.89-1033.30 milibar,
3. Relative Humidity (RH) in the range 25.56% to 100.16%
4. Exhaust Vacuum (V) in teh range 25.36-81.56 cm Hg
5. Net hourly electrical energy output (EP) 420.26-495.76 MW
The averages are taken from various sensors located around the plant that record the ambient variables every second. The variables are given without normalization. Almost all of the values of the variables are different and there is no different category in any variable.(No point in making contigency tables.)
Visualization of the variables of this study
attach(CCPP)
par(mfrow=c(2,3))
boxplot(AT, main="Temperature", horizontal = TRUE, col = "tomato3")
boxplot(V, main="Exhaust Vaccum", horizontal = TRUE, col = "tomato3")
boxplot(AP, main="Ambient Pressure", horizontal = TRUE, col ="tomato3" )
boxplot(RH, main="Relative Humidity", horizontal = TRUE, col = "tomato3")
boxplot(PE, main="Electic Energy Output of the Plant", horizontal = TRUE, col = "tomato3")
par(mfrow=c(2,3))

hist(AT, main="Ambient Temperature", breaks=20, col="yellow", freq = FALSE)
lines(density(AT, bw=10), type = "l", col="red", lwd=2)
hist(V, main="Exhaust Vaccum", breaks=20, col="yellow", freq = FALSE)
lines(density(V, bw=10), type = "l", col="red", lwd=2)
hist(AP, main="Ambient Pressure", breaks=20, col="yellow", freq = FALSE)
lines(density(AP, bw=10), type = "l", col="red", lwd=2)
hist(RH, main="Relative Humidity", breaks=20, col="yellow", freq = FALSE)
lines(density(RH, bw=10), type = "l", col="red", lwd=2)
hist(PE, main="Electric Energy Output of the Plant", breaks=20, col="yellow", freq = FALSE)
lines(density(PE, bw=10), type = "l", col="red", lwd=2)
par(mfrow=c(2,2))

ggplot(CCPP, aes(x=AT, y=PE))+
geom_point(col="coral")+
geom_smooth(model = lm)+
labs(x="Ambient Temperature", y="Electric Energy Output of the Plant")
## Warning: Ignoring unknown parameters: model
## `geom_smooth()` using method = 'gam'

ggplot(CCPP, aes(x=V, y=PE))+
geom_point(col="coral")+
geom_smooth(model = lm)+
labs(x="Exhaust Vaccum", y="Electric Energy Output of the Plant")
## Warning: Ignoring unknown parameters: model
## `geom_smooth()` using method = 'gam'

ggplot(CCPP, aes(x=AP, y=PE))+
geom_point(col="coral")+
geom_smooth(model = lm)+
labs(x="Ambient Pressure", y="Electric Energy Output of the Plant")
## Warning: Ignoring unknown parameters: model
## `geom_smooth()` using method = 'gam'

ggplot(CCPP, aes(x=RH, y=PE))+
geom_point(col="coral")+
geom_smooth(model = lm)+
labs(x="Relative Humidity", y="Electric Energy Output of the Plant")
## Warning: Ignoring unknown parameters: model
## `geom_smooth()` using method = 'gam'

Creating a Correlation Matrix Heatmap
matrix <- round(cor(CCPP),2)
# Getting lower triangle of the correlation matrix
get_lower_tri<-function(matrix){
matrix[upper.tri(matrix)] <- NA
return(matrix)
}
# Getting upper triangle of the correlation matrix
get_upper_tri <- function(matrix){
matrix[lower.tri(matrix)]<- NA
return(matrix)
}
upper_tri <- get_upper_tri(matrix)
# Melt the correlation matrix
library(reshape2)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Heatmap
library(ggplot2)
#reordering the correlation matrix according to the correlation coefficient is useful to identify the hidden pattern in the matrix. hclust for hierarchical clustering order is used below in the funtion defined.
reorder_cormat <- function(cormat){
# Use correlation between variables as distance
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]
}
# Reorder the correlation matrix
cormat <- reorder_cormat(matrix)
upper_tri <- get_upper_tri(matrix)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
#Adding correlation coefficients on the heatmap
#Using geom_text() to add the correlation coefficients on the graph
#Using a blank theme (remove axis labels, panel grids and background, and axis ticks)
#Using guides() to change the position of the legend title
ggheatmap +
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.justification = c(1, 0),
legend.position = c(0.6, 0.7),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5))

ScatterPlots of Electric Energy Output of the Plant vs. Other Variables in the Dataset.
library(hexbin)
library(RColorBrewer)
bin1 <- hexbin(AT, PE, xbins = 40)
bin2 <- hexbin(V, PE, xbins = 40)
bin3 <- hexbin(AP, PE, xbins = 40)
bin4 <- hexbin(RH, PE, xbins = 40)
my_colors=colorRampPalette(rev(brewer.pal(11,'Spectral')))
plot(bin1, main="", xlab= "Ambient Temperature", ylab="", colramp=my_colors)

plot(bin2, main="", xlab= "Exhaust Vaccum", ylab="", colramp=my_colors)

plot(bin3, main="", xlab= "Ambient Pressure", ylab="", colramp=my_colors)

plot(bin4, main="", xlab= "Relative Humidity", ylab="", colramp=my_colors)

Null Hypothesis : The net hourly electrical energy output of the plant is independent of the Ambient Temperature, Exhaust Vaccum, Ambient Pressure and Relative Humidity values detected by the sensors during that hour.
Chi-squared Tests of Independence
tbl1 <- table(AT,PE)
tbl2 <- table(V,PE)
tbl3 <- table(AP,PE)
tbl4 <- table(RH,PE)
chisq.test(tbl1)
## Warning in chisq.test(tbl1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tbl1
## X-squared = 14014000, df = 13403000, p-value < 2.2e-16
chisq.test(tbl2)
## Warning in chisq.test(tbl2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tbl2
## X-squared = 3056600, df = 3060600, p-value = 0.9431
chisq.test(tbl3)
## Warning in chisq.test(tbl3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tbl3
## X-squared = 12456000, df = 12165000, p-value < 2.2e-16
chisq.test(tbl4)
## Warning in chisq.test(tbl4): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tbl4
## X-squared = 22004000, df = 21975000, p-value = 6.578e-06
According to Chi-squared Tests of Independence, 3 out of 4 variables didn’t follow the null hypothesis as their p values are very less than 0.05. Hence for Ambient Temperature, Ambient Pressure and Relative humidity, the null hypothesis fails. Though according to this test, Exhaust vaccum follows the null hypothesis.
Regression Analysis
fit <- lm(PE~AT+V+AP+RH,CCPP)
library(sjPlot)
sjt.lm(fit, show.header = TRUE, emph.p = TRUE, string.dv = "The net hourly electrical energy output of the plant")
|
Predictors
|
The net hourly electrical energy output of the plant
|
|
PE
|
|
|
|
B
|
CI
|
p
|
|
(Intercept)
|
|
454.61
|
435.50 – 473.72
|
<.001
|
|
AT
|
|
-1.98
|
-2.01 – -1.95
|
<.001
|
|
V
|
|
-0.23
|
-0.25 – -0.22
|
<.001
|
|
AP
|
|
0.06
|
0.04 – 0.08
|
<.001
|
|
RH
|
|
-0.16
|
-0.17 – -0.15
|
<.001
|
|
Observations
|
|
9568
|
|
R2 / adj. R2
|
|
.929 / .929
|
Regression Diagnostics
The assumptions for linear regression are:
Linearity: The relationship between X and the mean of Y is linear.
Homoscedasticity: The variance of residual is the same for any value of X.
Independence: Observations are independent of each other.
Normality: For any fixed value of X, Y is normally distributed.
Component+Residual Plots
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
library(broom)
crPlots(fit)

aug.data = augment(fit) ###Store residuals into dataframe
Residual Plots
library(ggfortify)
## Warning: namespace 'DBI' is not available and has been replaced
## by .GlobalEnv when processing object 'exact'
## Warning: namespace 'DBI' is not available and has been replaced
## by .GlobalEnv when processing object 'exact'
autoplot(fit, label.size = 3)

Testing Outliers
outlierTest(fit)
## rstudent unadjusted p-value Bonferonni p
## 1581 -9.577798 1.2395e-21 1.1859e-17
## 3231 -9.487390 2.9376e-21 2.8107e-17
## 2206 -8.448690 3.3759e-17 3.2301e-13
## 5949 -6.835304 8.6831e-12 8.3080e-08
## 3540 -6.820705 9.6082e-12 9.1931e-08
## 3024 -6.233478 4.7546e-10 4.5492e-06
## 9496 -5.924763 3.2355e-09 3.0957e-05
## 6561 -5.745868 9.4249e-09 9.0177e-05
## 8592 -5.702510 1.2156e-08 1.1631e-04
## 4798 -5.207803 1.9507e-07 1.8664e-03
Though there are some outliers, but when the number is compared with total number of datapoints, it becomes insignificant.
Checking Multicollinearity
Theory
Variance inflation factors measure the inflation in the variances of the parameter estimates due to collinearities that exist among the predictors. It is a measure of how much the variance of the estimated regression coefficient is “inflated” by the existence of correlation among the predictor variables in the model. A VIF of 1 means that there is no correlation among the kth predictor and the remaining predictor variables, and hence the variance of the estimated regression coefficient is not inflated at all. The general rule of thumb is that VIFs exceeding 4 warrant further investigation, while VIFs exceeding 10 are signs of serious multicollinearity requiring correction.
vif(fit)
## AT V AP RH
## 5.977602 3.943003 1.452639 1.705290
sqrt(vif(fit)) > 2
## AT V AP RH
## TRUE FALSE FALSE FALSE
Testing Heteroscadasticity
Theory
Heteroscadasticity refers to the circumstance in which the variability of a variable is unequal across the range of values of a second variable that predicts it.
Non-constant Variance Score Test : This test is more prominently know as Breusch-Pagan test. It is a test for heteroscedasticity. In a standard linear model, the variance of the residuals are assumed to be constant (i.e. independent) over the values of the response (fitted values).
ncvTest(fit)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 7.513071 Df = 1 p = 0.006125284
The p value is <0.05. Therefore we can reject the null hypothesis that the variance of the residuals is constant. Hence, there is a heteroscadasticity issue associated with the model.
Choosing the Best Model
mod1 = lm(PE ~ AT + RH + AP, data=CCPP)
mod2 = lm(PE ~ AT + RH + V, data=CCPP)
mod3 = lm(PE ~ AT + V + AP, data=CCPP)
mod4 = lm(PE ~ V + RH + AP, data=CCPP)
mod5 = lm(PE ~ V + RH, data=CCPP)
mod6 = lm(PE ~ RH + AP, data=CCPP)
mod7 = lm(PE ~ AT + RH, data=CCPP)
mod8 = lm(PE ~ AT + AP, data=CCPP)
mod9 = lm(PE ~ V+ AP, data=CCPP)
mod10 = lm(PE ~ AT + V, data=CCPP)
par(mfrow=c(2,5))
sjt.lm(mod1, show.header = TRUE, emph.p = TRUE, string.dv = "The net hourly electrical energy output of the plant")
|
Predictors
|
The net hourly electrical energy output of the plant
|
|
PE
|
|
|
|
B
|
CI
|
p
|
|
(Intercept)
|
|
490.32
|
470.34 – 510.31
|
<.001
|
|
AT
|
|
-2.38
|
-2.40 – -2.36
|
<.001
|
|
RH
|
|
-0.20
|
-0.21 – -0.20
|
<.001
|
|
AP
|
|
0.03
|
0.01 – 0.04
|
.010
|
|
Observations
|
|
9568
|
|
R2 / adj. R2
|
|
.921 / .921
|
sjt.lm(mod2, show.header = TRUE, emph.p = TRUE, string.dv = "The net hourly electrical energy output of the plant")
|
Predictors
|
The net hourly electrical energy output of the plant
|
|
PE
|
|
|
|
B
|
CI
|
p
|
|
(Intercept)
|
|
518.55
|
517.79 – 519.31
|
<.001
|
|
AT
|
|
-2.02
|
-2.05 – -1.99
|
<.001
|
|
RH
|
|
-0.17
|
-0.17 – -0.16
|
<.001
|
|
V
|
|
-0.23
|
-0.24 – -0.21
|
<.001
|
|
Observations
|
|
9568
|
|
R2 / adj. R2
|
|
.928 / .928
|
sjt.lm(mod3, show.header = TRUE, emph.p = TRUE, string.dv = "The net hourly electrical energy output of the plant")
|
Predictors
|
The net hourly electrical energy output of the plant
|
|
PE
|
|
|
|
B
|
CI
|
p
|
|
(Intercept)
|
|
344.07
|
324.51 – 363.63
|
<.001
|
|
AT
|
|
-1.63
|
-1.66 – -1.61
|
<.001
|
|
V
|
|
-0.33
|
-0.34 – -0.31
|
<.001
|
|
AP
|
|
0.16
|
0.14 – 0.18
|
<.001
|
|
Observations
|
|
9568
|
|
R2 / adj. R2
|
|
.918 / .918
|
sjt.lm(mod4, show.header = TRUE, emph.p = TRUE, string.dv = "The net hourly electrical energy output of the plant")
|
Predictors
|
The net hourly electrical energy output of the plant
|
|
PE
|
|
|
|
B
|
CI
|
p
|
|
(Intercept)
|
|
-74.97
|
-103.73 – -46.22
|
<.001
|
|
V
|
|
-1.00
|
-1.02 – -0.99
|
<.001
|
|
RH
|
|
0.16
|
0.15 – 0.17
|
<.001
|
|
AP
|
|
0.56
|
0.54 – 0.59
|
<.001
|
|
Observations
|
|
9568
|
|
R2 / adj. R2
|
|
.804 / .804
|
sjt.lm(mod5, show.header = TRUE, emph.p = TRUE, string.dv = "The net hourly electrical energy output of the plant")
|
Predictors
|
The net hourly electrical energy output of the plant
|
|
PE
|
|
|
|
B
|
CI
|
p
|
|
(Intercept)
|
|
503.59
|
502.28 – 504.90
|
<.001
|
|
V
|
|
-1.11
|
-1.13 – -1.10
|
<.001
|
|
RH
|
|
0.15
|
0.14 – 0.16
|
<.001
|
|
Observations
|
|
9568
|
|
R2 / adj. R2
|
|
.772 / .772
|
sjt.lm(mod6, show.header = TRUE, emph.p = TRUE, string.dv = "The net hourly electrical energy output of the plant")
|
Predictors
|
The net hourly electrical energy output of the plant
|
|
PE
|
|
|
|
B
|
CI
|
p
|
|
(Intercept)
|
|
-985.49
|
-1031.41 – -939.58
|
<.001
|
|
RH
|
|
0.40
|
0.38 – 0.42
|
<.001
|
|
AP
|
|
1.39
|
1.35 – 1.44
|
<.001
|
|
Observations
|
|
9568
|
|
R2 / adj. R2
|
|
.384 / .384
|
sjt.lm(mod7, show.header = TRUE, emph.p = TRUE, string.dv = "The net hourly electrical energy output of the plant")
|
Predictors
|
The net hourly electrical energy output of the plant
|
|
PE
|
|
|
|
B
|
CI
|
p
|
|
(Intercept)
|
|
516.48
|
515.69 – 517.26
|
<.001
|
|
AT
|
|
-2.39
|
-2.41 – -2.38
|
<.001
|
|
RH
|
|
-0.21
|
-0.21 – -0.20
|
<.001
|
|
Observations
|
|
9568
|
|
R2 / adj. R2
|
|
.921 / .921
|
sjt.lm(mod8, show.header = TRUE, emph.p = TRUE, string.dv = "The net hourly electrical energy output of the plant")
|
Predictors
|
The net hourly electrical energy output of the plant
|
|
PE
|
|
|
|
B
|
CI
|
p
|
|
(Intercept)
|
|
349.95
|
328.45 – 371.45
|
<.001
|
|
AT
|
|
-2.11
|
-2.13 – -2.10
|
<.001
|
|
AP
|
|
0.14
|
0.12 – 0.17
|
<.001
|
|
Observations
|
|
9568
|
|
R2 / adj. R2
|
|
.901 / .901
|
sjt.lm(mod9, show.header = TRUE, emph.p = TRUE, string.dv = "The net hourly electrical energy output of the plant")
|
Predictors
|
The net hourly electrical energy output of the plant
|
|
PE
|
|
|
|
B
|
CI
|
p
|
|
(Intercept)
|
|
-45.67
|
-75.57 – -15.76
|
.003
|
|
V
|
|
-1.06
|
-1.08 – -1.05
|
<.001
|
|
AP
|
|
0.55
|
0.52 – 0.58
|
<.001
|
|
Observations
|
|
9568
|
|
R2 / adj. R2
|
|
.787 / .787
|
sjt.lm(mod10, show.header = TRUE, emph.p = TRUE, string.dv = "The net hourly electrical energy output of the plant")
|
Predictors
|
The net hourly electrical energy output of the plant
|
|
PE
|
|
|
|
B
|
CI
|
p
|
|
(Intercept)
|
|
505.48
|
505.01 – 505.95
|
<.001
|
|
AT
|
|
-1.70
|
-1.73 – -1.68
|
<.001
|
|
V
|
|
-0.32
|
-0.34 – -0.31
|
<.001
|
|
Observations
|
|
9568
|
|
R2 / adj. R2
|
|
.916 / .916
|
Now, we refine our selection to mod1 and mod2.
Checking Multicollinearity
sqrt(vif(mod1)) > 2
## AT RH AP
## FALSE FALSE FALSE
sqrt(vif(mod2)) > 2
## AT RH V
## TRUE FALSE FALSE
Testing Heteroscadasticity
ncvTest(mod1)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.6041578 Df = 1 p = 0.436996
ncvTest(mod2)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 3.169887 Df = 1 p = 0.07500765
Hence we select mod1.
Coefficients of the Model
coef(mod1)
## (Intercept) AT RH AP
## 490.32374639 -2.37770850 -0.20383189 0.02537245
So, the model with predictor variables AT, RH & AP produces the better energy output.
Final Model : Energy Output (PE) = 490.323746 -2.377708Temperature(AT) - 0.203832Humidity(RH) + 0.025372 Pressure(AP)
Summarizing
Because air density varies inversely with ambient temperature, the air mass flow rate entering in a typical machine of specific size and rotational speed is reduced on a hot day. The air temperature has a large influence on the power output and efficiency of a gas turbine.Power output is reduced when the temperature is above the standard ambient temperature
Similarly as the relative Humidity increases, it inversely affects the air flow mass rate hence decreasing the net energy output.
As for the ambient pressure, more the ambient pressure, more the air flow rate. Hence affecting the effciency positively.
But overall, Ambient Temperature is relatively the most important variable for this linear model.