A combined cycle power plant (CCPP) is composed of gas turbines (GT), steam turbines (ST), and heat recovery steam generators. In a CCPP, the electricity is generated by gas and steam turbines, which are combined in one cycle, and is transferred from one turbine to another. While the Vacuum is collected from and has an effect on the Steam Turbine, the other three ambient variables (Ambient Temperature, Ambient Pressure, and Relative Humidity) affect the GT performance. For comparability with our baseline studies, and to allow 5x2-fold statistical tests to be carried out, we provide the data shuffled five times. For each shuffling, 2-fold CV is carried out, and the resulting 10 measurements are used for statistical testing. Using the data provided, we have to predict the Energy Output (PE).

Exploratory data analysis

# Dimensions of the full data set: number of rows, then number of columns
dim(mydata)
## [1] 9568    5
# Compact per-column overview: type and the first few values of each variable
str(mydata)
## Classes 'tbl_df', 'tbl' and 'data.frame':    9568 obs. of  5 variables:
##  $ AT: num  14.96 25.18 5.11 20.86 10.82 ...
##  $ V : num  41.8 63 39.4 57.3 37.5 ...
##  $ AP: num  1024 1020 1012 1010 1009 ...
##  $ RH: num  73.2 59.1 92.1 76.6 96.6 ...
##  $ PE: num  463 444 489 446 474 ...
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Hmisc::describe(): per-variable n, missing count, distinct values, mean and quantiles
describe(mydata)
## mydata 
## 
##  5  Variables      9568  Observations
## ---------------------------------------------------------------------------
## AT 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     9568        0     2773        1    19.65    8.578     7.46     9.34 
##      .25      .50      .75      .90      .95 
##    13.51    20.34    25.72    29.24    30.92 
## 
## lowest :  1.81  2.34  2.58  2.64  2.71, highest: 35.10 35.20 35.56 35.77 37.11
## ---------------------------------------------------------------------------
## V 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     9568        0      634        1    54.31    14.45    38.73    39.72 
##      .25      .50      .75      .90      .95 
##    41.74    52.08    66.54    71.58    73.68 
## 
## lowest : 25.36 25.88 34.03 34.69 35.19, highest: 79.05 79.74 80.18 80.25 81.56
## ---------------------------------------------------------------------------
## AP 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     9568        0     2517        1     1013    6.683     1004     1006 
##      .25      .50      .75      .90      .95 
##     1009     1013     1017     1021     1023 
## 
## lowest :  992.89  993.11  993.31  993.74  993.82
## highest: 1033.14 1033.19 1033.25 1033.29 1033.30
## ---------------------------------------------------------------------------
## RH 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     9568        0     4546        1    73.31    16.58    46.58    52.76 
##      .25      .50      .75      .90      .95 
##    63.33    74.97    84.83    91.00    94.60 
## 
## lowest :  25.56  25.89  26.30  26.67  28.16, highest: 100.12 100.13 100.14 100.15 100.16
## ---------------------------------------------------------------------------
## PE 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     9568        0     4836        1    454.4    19.56    430.8    433.4 
##      .25      .50      .75      .90      .95 
##    439.8    451.5    468.4    479.0    483.5 
## 
## lowest : 420.26 421.57 425.11 425.12 425.14, highest: 495.21 495.23 495.24 495.35 495.76
## ---------------------------------------------------------------------------
# Minimum, quartiles, mean and maximum for every column
summary(mydata)
##        AT              V               AP               RH        
##  Min.   : 1.81   Min.   :25.36   Min.   : 992.9   Min.   : 25.56  
##  1st Qu.:13.51   1st Qu.:41.74   1st Qu.:1009.1   1st Qu.: 63.33  
##  Median :20.34   Median :52.08   Median :1012.9   Median : 74.97  
##  Mean   :19.65   Mean   :54.31   Mean   :1013.3   Mean   : 73.31  
##  3rd Qu.:25.72   3rd Qu.:66.54   3rd Qu.:1017.3   3rd Qu.: 84.83  
##  Max.   :37.11   Max.   :81.56   Max.   :1033.3   Max.   :100.16  
##        PE       
##  Min.   :420.3  
##  1st Qu.:439.8  
##  Median :451.6  
##  Mean   :454.4  
##  3rd Qu.:468.4  
##  Max.   :495.8

# Fix the RNG seed so the same 200 rows are drawn on every run
set.seed(100)
# Draw 200 row indices without replacement and keep only those rows
df <- mydata[sample(x = nrow(mydata), size = 200), ]
# Print the sampled data
df
## # A tibble: 200 x 5
##       AT     V    AP    RH    PE
##    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  31.2  68.7 1006.  52.1  437.
##  2  12.7  44.9 1008.  71.8  463.
##  3  32.4  44.4 1005.  44.4  430.
##  4  17.5  63.9 1020.  82.6  453.
##  5  11.7  40.2 1018.  82.5  475.
##  6  22.0  51.2 1009.  82.6  447.
##  7  17.3  58.6 1016.  87.2  455.
##  8  24.3  56.6 1014.  70.4  443.
##  9  22.3  44.9 1010.  78.9  447.
## 10  28.6  69.8 1004.  67.3  428.
## # ... with 190 more rows
# Confirm the size of the sample
nrow(df)
## [1] 200
# NOTE(review): attach() puts df's columns on the search path so later calls can
# use bare names (AT, V, AP, RH, PE). mydata is evidently attached as well (see
# the masking message that follows), so name resolution depends on attach
# order -- consider passing data = df or using with(df, ...) instead.
attach(df)
## The following objects are masked from mydata:
## 
##     AP, AT, PE, RH, V
# Lay the four predictor histograms out on a 2 x 2 grid. par() returns the
# previous settings, which are restored afterwards so the grid layout does not
# leak into later plots.
old_par <- par(mfrow = c(2, 2))
# with(df, ...) reads the columns from the sample explicitly instead of relying
# on whichever attached data frame comes first on the search path. The bare
# column names are kept so the default titles ("Histogram of AT", ...) are
# unchanged.
with(df, {
  hist(AT, col = "Light Blue")
  hist(V, col = "Green")
  hist(AP, col = "blue")
  hist(RH, col = "Orange")
})
par(old_par)

# Distribution of the response (PE) on its own full-size panel
with(df, hist(PE, col = "Pink"))
# plot() on a data frame draws a scatterplot matrix of all variable pairs
plot(df, col = "Blue")

Simple Linear Regression - Independent Variable Ambient Temperature

# Pearson correlation test between energy output (PE) and ambient temperature (AT).
# The formula interface takes both columns from df explicitly, so the result
# does not depend on which attached data frame is first on the search path.
cor.test(~ PE + AT, data = df)
## 
##  Pearson's product-moment correlation
## 
## data:  PE and AT
## t = -48.106, df = 198, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.9694332 -0.9471683
## sample estimates:
##        cor 
## -0.9597832
# Scatter plot of PE (y) against AT (x); columns are taken from df explicitly
# rather than resolved through the search path.
plot(PE ~ AT, data = df, col = "Red", main = "Energy Produced Vs Ambient Temp")

# Simple linear regression of energy output on ambient temperature
fitPE1 <- lm(PE ~ AT, data = df)
summary(fitPE1)
## 
## Call:
## lm(formula = PE ~ AT, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.6247  -3.7930   0.0608   3.5700  17.9346 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 497.23212    0.97080  512.19   <2e-16 ***
## AT           -2.20199    0.04577  -48.11   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.005 on 198 degrees of freedom
## Multiple R-squared:  0.9212, Adjusted R-squared:  0.9208 
## F-statistic:  2314 on 1 and 198 DF,  p-value: < 2.2e-16

Simple Linear Regression - Independent Variable Exhaust Vacuum

# Pearson correlation test between energy output (PE) and exhaust vacuum (V).
# The formula interface takes both columns from df explicitly, so the result
# does not depend on which attached data frame is first on the search path.
cor.test(~ PE + V, data = df)
## 
##  Pearson's product-moment correlation
## 
## data:  PE and V
## t = -22.622, df = 198, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8837608 -0.8052626
## sample estimates:
##        cor 
## -0.8491356
# Scatter plot of PE (y) against V (x); columns are taken from df explicitly
# rather than resolved through the search path.
plot(PE ~ V, data = df, col = "Red", main = "Energy Produced Vs Exhaust Vacuum")

# Simple linear regression of energy output on exhaust vacuum
fitPE2 <- lm(PE ~ V, data = df)
summary(fitPE2)
## 
## Call:
## lm(formula = PE ~ V, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -36.22  -5.54   0.68   6.09  20.39 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 520.26200    3.01481  172.57   <2e-16 ***
## V            -1.22236    0.05403  -22.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.416 on 198 degrees of freedom
## Multiple R-squared:  0.721,  Adjusted R-squared:  0.7196 
## F-statistic: 511.8 on 1 and 198 DF,  p-value: < 2.2e-16

Simple Linear Regression - Independent Variable Ambient Pressure

# Pearson correlation test between energy output (PE) and ambient pressure (AP).
# The formula interface takes both columns from df explicitly, so the result
# does not depend on which attached data frame is first on the search path.
cor.test(~ PE + AP, data = df)
## 
##  Pearson's product-moment correlation
## 
## data:  PE and AP
## t = 6.9888, df = 198, p-value = 4.137e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3262174 0.5496451
## sample estimates:
##       cor 
## 0.4448257
# Scatter plot of PE (y) against AP (x); columns are taken from df explicitly
# rather than resolved through the search path.
plot(PE ~ AP, data = df, col = "Red", main = "Energy Produced Vs Ambient Pressure")

# Simple linear regression of energy output on ambient pressure
fitPE3 <- lm(PE ~ AP, data = df)
summary(fitPE3)
## 
## Call:
## lm(formula = PE ~ AP, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -30.696 -11.633  -3.776  10.749  43.704 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -989.0473   206.4478  -4.791 3.25e-06 ***
## AP             1.4241     0.2038   6.989 4.14e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.97 on 198 degrees of freedom
## Multiple R-squared:  0.1979, Adjusted R-squared:  0.1938 
## F-statistic: 48.84 on 1 and 198 DF,  p-value: 4.137e-11

Simple Linear Regression - Independent Variable Relative Humidity

# Pearson correlation test between energy output (PE) and relative humidity (RH).
# The formula interface takes both columns from df explicitly, so the result
# does not depend on which attached data frame is first on the search path.
cor.test(~ PE + RH, data = df)
## 
##  Pearson's product-moment correlation
## 
## data:  PE and RH
## t = 7.5716, df = 198, p-value = 1.371e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3586883 0.5748007
## sample estimates:
##       cor 
## 0.4738484
# Scatter plot of PE (y) against RH (x); columns are taken from df explicitly
# rather than resolved through the search path.
plot(PE ~ RH, data = df, col = "Red", main = "Energy Produced Vs Relative Humidity")

# Simple linear regression of energy output on relative humidity
fitPE4 <- lm(PE ~ RH, data = df)
summary(fitPE4)
## 
## Call:
## lm(formula = PE ~ RH, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.264 -11.644  -2.894  11.719  37.520 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 412.92830    5.50384  75.025  < 2e-16 ***
## RH            0.56009    0.07397   7.572 1.37e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.7 on 198 degrees of freedom
## Multiple R-squared:  0.2245, Adjusted R-squared:  0.2206 
## F-statistic: 57.33 on 1 and 198 DF,  p-value: 1.371e-12

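Summary of Simple Linear Regression Models:

- Ambient Temperature (AT): slope -2.20, R-squared 0.9212, residual standard error 5.005
- Exhaust Vacuum (V): slope -1.22, R-squared 0.7210, residual standard error 9.416
- Ambient Pressure (AP): slope 1.42, R-squared 0.1979, residual standard error 15.97
- Relative Humidity (RH): slope 0.56, R-squared 0.2245, residual standard error 15.7

In this 200-row sample, Ambient Temperature alone explains about 92% of the variance in Energy Output, making it the strongest single predictor, followed by Exhaust Vacuum (about 72%). Ambient Pressure and Relative Humidity each explain only about 20-22%.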

Multiple Linear Regression

library("corrplot")
## Warning: package 'corrplot' was built under R version 3.5.1
## corrplot 0.84 loaded
# Pearson correlation matrix of all columns in the sample
cm <- cor(x = df, method = "pearson")
# Upper-triangle circle plot, variables ordered by hierarchical clustering,
# with black text labels rotated 45 degrees
corrplot(
  corr = cm,
  method = "circle",
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)

From the above correlation plot, we see that Ambient Temperature and Exhaust Vacuum have a strong negative correlation with Energy Produced, whereas Relative Humidity (and, likewise, Ambient Pressure) has a weaker, positive correlation with Energy Produced.

# Multiple linear regression of energy output on all four ambient variables.
# lm() is a deterministic least-squares fit, so no RNG seed is needed here.
fitPE5 <- lm(PE ~ AT + V + AP + RH, data = df)
summary(fitPE5)
## 
## Call:
## lm(formula = PE ~ AT + V + AP + RH, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1031  -2.9245  -0.5918   3.2924  15.9344 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 465.82947   64.66649   7.204 1.25e-11 ***
## AT           -2.13080    0.09374 -22.732  < 2e-16 ***
## V            -0.17330    0.04760  -3.641 0.000348 ***
## AP            0.04965    0.06326   0.785 0.433527    
## RH           -0.14922    0.02737  -5.451 1.50e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.374 on 195 degrees of freedom
## Multiple R-squared:  0.9407, Adjusted R-squared:  0.9395 
## F-statistic: 773.4 on 4 and 195 DF,  p-value: < 2.2e-16

End of Document