Visualizing the Missing value :

Time span of Data:
Data is of: 131.9931 Days
No of NA Available in Data:
[1] "Missing Values: 0"
[1] "Incomplete Records: 0"
Parameter in Data Set:
names(df1)
[1] "Amb_Temp" "G1L1" "G1L2" "G1L3" "DE_Temp" "NDE_Temp" "Grid" "Power"
[9] "C1" "C2" "C3" "Asymmt"
table(df1$Grid == 600)
FALSE TRUE
3679 14933
df1$Grid <- ifelse(df1$Grid < 600,"GNC","GC")
df1$Grid <- as.factor(df1$Grid)
str(df1)
'data.frame': 18612 obs. of 12 variables:
$ Amb_Temp: num 30.3 30 29.3 29 28.2 27.9 27.1 27 27 27 ...
$ G1L1 : num 68 67.7 66.4 65 64.5 65.2 66 66 66 66.9 ...
$ G1L2 : num 69 68.3 67.1 65.5 64.9 66 66 66.8 67 67.3 ...
$ G1L3 : num 68 67.6 66.3 65 64.4 65 65.6 66 66 66.2 ...
$ DE_Temp : num 54 54 54 53.4 53 52.1 52 52 52 52 ...
$ NDE_Temp: num 55 55 55 55 54 54 53.1 53 53 53 ...
$ Grid : Factor w/ 2 levels "GC","GNC": 1 2 2 2 1 1 1 1 1 1 ...
$ Power : num 60 2.4 -3.5 5.4 104.5 ...
$ C1 : num 43.2 23.4 9.1 20.1 70.6 ...
$ C2 : num 80.8 29.7 13.6 24.4 120.5 ...
$ C3 : num 82.6 37.3 10.8 22.6 89.7 ...
$ Asymmt : num 14 5.8 1.8 1.9 14.8 13 12.4 12.6 14.9 14.4 ...
df1<- as.data.frame(df1)
No of event for Grid Connection :
Summarising Data:
Data preparation for Plotting:
[1] "Grid" "variable" "value"
It is commonly said that the turbine generates the most power while it is connected to the grid, so let's check whether the data supports this.
Let's start with density plots and look at how each parameter behaves when the grid is connected 100% of the time.
GC : Grid Connection (turbine at maximum generation), Grid = 600 seconds.
GNC : Grid Not Connected for the full interval, Grid < 600 seconds.
# Density Plots
ggplot(data = df.melt, aes(x = value)) +
facet_wrap(~variable, scales = "free") +
theme_dark() +
theme(axis.text.x=element_blank())+
geom_density(aes(fill = Grid),alpha = .5)+
ggtitle("Density Plots")

Distribution of other Parameters

ggplot(df1 , aes(x=Power ,group=Grid ,fill=Grid) )+
geom_density(position = "identity",alpha=.4)+ scale_x_continuous(limits = c(0, 1000))+
scale_color_brewer(palette = "Set1")

ggplot(df1 , aes(Power)) +
geom_histogram(binwidth = 5,col = "blue",fill = "skyblue")+
scale_x_continuous(limits = c(1, 1000))

Power Distribution During Grid Connect and Non-Grid Connect:
p1=ggplot(df1, aes(Grid, Power, fill = Grid)) + geom_boxplot(show.legend = NA) + ggtitle("Power Distribution")+ labs(x="WTG",y="Temperature")
ggplotly(p1)
The boxplot shows that the turbine generates power during full grid connection (600 seconds).
dodge = position_dodge(width = 2)
ggplot(df.melt, aes(x=variable, y=value,fill=Grid))+
geom_violin(width = 3,position = dodge) +
geom_boxplot(width = 0.5,position = dodge)+
labs(x="features", y = "values")+
facet_wrap(~variable, scales = "free") +
ggtitle("Violin Plot") +
theme(axis.text.x=element_blank())

Correlation of Entire data frame :
library(ggcorrplot);library(psych)
# For Numerical Variables
nums <- sapply(df1, is.numeric)
num_df <- df1[,c(nums)]
pairs.panels(num_df)

Almost all variables have a correlation above 0.7, apart from ambient temperature.
pr.com <- prcomp(num_df,scale. = TRUE)
summary(pr.com)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
Standard deviation 2.9663 1.1415 0.72099 0.48494 0.33448 0.16959 0.04151 0.02116 0.009729
Proportion of Variance 0.7999 0.1184 0.04726 0.02138 0.01017 0.00261 0.00016 0.00004 0.000010
Cumulative Proportion 0.7999 0.9184 0.96562 0.98700 0.99717 0.99978 0.99994 0.99998 0.999990
PC10 PC11
Standard deviation 0.008656 0.006185
Proportion of Variance 0.000010 0.000000
Cumulative Proportion 1.000000 1.000000
pr.var <- (pr.com$sdev)^2
pve <- pr.var/sum(pr.var)
par(mfrow = c(2, 2))
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
plot(cumsum(pve), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
prop_varex <- pr.var/sum(pr.var)
plot(prop_varex, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = "b")

---
title: "EDA: Generator System"
output: html_notebook
---

```{r, message=FALSE, warning=FALSE, include=FALSE}
# Start from an empty workspace so objects left over from earlier sessions
# cannot leak into the notebook.
rm(list = ls())

# Packages used below: readxl (read_excel), Amelia (missmap), reshape (melt),
# ggplot2 (static plots) and plotly (ggplotly).
library(readxl)
library(Amelia)
library(reshape)
library(ggplot2)
library(plotly)

# The workbook is read relative to this project folder.
setwd("H:\\Product factory\\Generator System")
#df<- fread("504102.Data Explorer (WTG).xlsx")
df <- read_excel("504102.Data Explorer (WTG).xlsx")
```

Visualizing missing values:
```{r, echo=FALSE, fig.height=6, fig.width=11, message=FALSE, warning=FALSE}
# Draw the missingness map of the raw data frame as read from the workbook.
missmap(df)
```


Time span of Data:
```{r, echo=FALSE, message=FALSE, warning=FALSE}
#names(df)
# The first column of the sheet is kept aside as the time stamp of each record.
Local_Time <- df[, c(1)] # Separating Local Time
names(Local_Time)[1] <- "Time"

# Span between the first and last record. When the column is a date-time,
# `max - min` is a difftime whose units R chooses automatically (secs, mins,
# hours, days ...), so force days to keep the printed "Days" label truthful.
time_span <- max(Local_Time$Time) - min(Local_Time$Time)
if (inherits(time_span, "difftime")) {
  units(time_span) <- "days"
}
cat("Data is of:", as.numeric(time_span), "Days")
```



```{r, include=FALSE}


# Drop the time-stamp column; only measurement columns remain.
df <- df[, -c(1)]
#seq(0,ncol(df),12)#0  12  24  36  48  60  72  84  96 108 120 132 144 156 168 180 192
# `start` is a zero-based column offset and `end` the last column of the
# 12-column group (presumably one group per turbine - confirm against the
# workbook layout).
start <- 0
end <- start + 12
# Select columns start+1 .. end. Indexing with `start:end` only worked for
# start = 0 because R silently drops index 0; for any other offset it would
# pick 13 columns and overlap the previous group.
df1 <- df[, (start + 1):end]
names(df1) <- c("Amb_Temp", "G1L1", "G1L2", "G1L3", "DE_Temp", "NDE_Temp",
                "Grid", "Power", "C1", "C2", "C3", "Asymmt")

# Keep only rows without any missing value, then convert to a plain data.frame.
df1 <- df1[complete.cases(df1), ]
df1 <- as.data.frame(df1)
```

Number of NA values in the data:  
```{r, echo=FALSE, message=FALSE, warning=FALSE}
# Report missing data in a data frame.
#
# Prints the total number of NA cells and the number of rows that contain at
# least one NA. With 1-100 NA cells the incomplete rows are printed; with more
# than 100 a histogram of NA counts per column index is drawn.
#
# df: a data frame.
# Value: the histogram object (from hist()) when there are more than 100 NA
#        cells, otherwise NULL (invisibly).
check <- function(df) {
  # count NA (missing values)
  NAs <- sum(is.na(df))
  print(paste("Missing Values:", NAs))

  # count incomplete records (rows containing missing values)
  ok <- complete.cases(df)
  print(paste("Incomplete Records:", sum(!ok)))

  # Show incomplete records (if at most 100 NAs).
  # `&&` because both sides are single values.
  if (NAs > 0 && NAs <= 100) print(df[!ok, , drop = FALSE])

  # If more than 100, show column-wise distribution of NAs.
  if (NAs > 100) {
    na_cols <- which(is.na(df), arr.ind = TRUE)[, 2]
    # One bin centred on every column index. Using 1:ncol(df) as breaks would
    # merge columns 1 and 2 into the first bin (and a single break value is
    # interpreted by hist() as a number of bins).
    hist(na_cols, xlab = "Column", freq = TRUE,
         breaks = seq(0.5, ncol(df) + 0.5, by = 1),
         main = "Column-wise distribution of missing values",
         labels = TRUE, col = "grey")
  }
}
# Run the missing-value report on the 12-column working data frame.
check(df1)   
```




```{r, eval=FALSE, include=FALSE}
# NA count per column of the working data frame.
colSums(is.na(df1))
# Keep an untouched copy before any outlier treatment.
df0 <- df1
# Clamp outliers of a numeric vector to its quartiles.
#
# Values below Q1 - 1.5 * IQR are replaced by Q1 and values above
# Q3 + 1.5 * IQR are replaced by Q3; everything else is returned unchanged.
#
# x:     numeric vector.
# na.rm: passed to quantile() and IQR().
# ...:   further arguments forwarded to quantile().
remove_outliers <- function(x, na.rm = TRUE, ...) {
  quartiles <- quantile(x, probs = c(.25, .75), na.rm = na.rm, ...)
  fence <- 1.5 * IQR(x, na.rm = na.rm)
  lower_q <- quartiles[1]
  upper_q <- quartiles[2]

  too_low <- x < (lower_q - fence)
  too_high <- x > (upper_q + fence)

  y <- x
  y[too_low] <- lower_q
  y[too_high] <- upper_q
  y
}
    # Treat every column with remove_outliers() while keeping a data frame.
    # apply() would coerce to a matrix, and assigning that matrix to df1 would
    # make every later `df1$...` access fail.
    # NOTE(review): this also clamps the Grid column, which is still numeric
    # at this point - confirm that is intended before enabling this chunk.
    dff <- df0
    dff[] <- lapply(df0, remove_outliers)
    # Replace any remaining NA in a column by that column's mean.
    for (j in seq_len(ncol(dff))) {
      dff[is.na(dff[, j]), j] <- mean(dff[, j], na.rm = TRUE)
    }
    df1 <- dff
    rm(dff)
```

Parameter in Data Set:  
```{r}
# Column names of the working data frame.
names(df1)
```

Number of records with full grid connection (Grid == 600):  
```{r, echo=FALSE, message=FALSE, warning=FALSE}
# Count records whose Grid value equals 600 (TRUE) versus any other value (FALSE).
table(df1$Grid == 600)
```


Summarising Data:  
```{r, echo=FALSE, message=FALSE, warning=FALSE}
# Recode the numeric Grid value into a two-level factor:
# below 600 -> "GNC", otherwise -> "GC".
df1$Grid <- as.factor(ifelse(df1$Grid < 600, "GNC", "GC"))

# Structure overview of the recoded data.
str(df1)
df1 <- as.data.frame(df1)
```


Data preparation for Plotting:  
```{r, echo=FALSE, message=FALSE, warning=FALSE}
# Long format for faceted plots: Grid stays as the id column, every other
# column is stacked into variable/value pairs.
df.melt <- melt(df1, id.vars = "Grid")
names(df.melt)
head(df.melt)
```

It is commonly said that the turbine generates the most power while it is connected to the grid, so let's check whether the data supports this.   

Let's start with density plots and look at how each parameter behaves when the grid is connected 100% of the time.      
GC  :  Grid Connection (turbine at maximum generation), Grid = 600 seconds.    
GNC :  Grid Not Connected for the full interval, Grid < 600 seconds.    
```{r, fig.height=6, fig.width=11, message=FALSE, warning=FALSE}
# Density Plots
# One panel per variable (free scales), with the two Grid states overlaid
# as semi-transparent densities.
ggplot(df.melt, aes(x = value)) +
  geom_density(aes(fill = Grid), alpha = .5) +
  facet_wrap(~variable, scales = "free") +
  ggtitle("Density Plots") +
  theme_dark() +
  theme(axis.text.x = element_blank())
```


Distribution of other Parameters  

```{r, echo=FALSE, fig.height=6, fig.width=11, message=FALSE, warning=FALSE}
# Histograms and density lines
# Panels are laid out on a 3 x 4 grid.
par(mfrow = c(3, 4))
for (col_name in names(df1)) {
  values <- df1[[col_name]]
  # Grid is a factor by this point; only numeric columns can be binned.
  if (is.numeric(values)) {
    # Label the x axis with the column name. Without an explicit xlab every
    # panel is labelled with the plotting expression itself ("df1[, i]").
    hist(values, main = col_name, xlab = col_name, probability = TRUE,
         col = "gray", border = "#CD5C5C")
    lines(density(values), col = "red")
  }
}
```



```{r, fig.height=6, fig.width=11, message=FALSE, warning=FALSE}
# Density of Power for each Grid state, overlaid with transparency.
# NOTE: the x limits make ggplot2 drop records outside 0..1000 (e.g. negative
# Power readings) before the density is estimated.
ggplot(df1, aes(x = Power, group = Grid, fill = Grid)) +
  geom_density(position = "identity", alpha = .4) +
  scale_x_continuous(limits = c(0, 1000)) +
  # Only `fill` is mapped, so the palette must be applied to the fill scale;
  # a colour scale would have no effect on this plot.
  scale_fill_brewer(palette = "Set1")
```



```{r, fig.height=6, fig.width=11, message=FALSE, warning=FALSE}
# Histogram of Power in bins of width 5, restricted to the 1..1000 range.
ggplot(df1, aes(x = Power)) +
  geom_histogram(binwidth = 5, colour = "blue", fill = "skyblue") +
  scale_x_continuous(limits = c(1, 1000))

```



Power Distribution During Grid Connect and Non-Grid Connect:
```{r, fig.height=6, fig.width=11, message=FALSE, warning=FALSE}

# Interactive boxplot of Power for each Grid state.
# The axis labels name what is actually plotted: Grid state on x and Power on
# y (the previous labels read "WTG" and "Temperature").
p1 <- ggplot(df1, aes(Grid, Power, fill = Grid)) +
  geom_boxplot(show.legend = NA) +
  ggtitle("Power Distribution") +
  labs(x = "Grid state", y = "Power")
ggplotly(p1)
```

The boxplot shows that the turbine generates power during full grid connection (600 seconds).    


```{r, fig.height=6, fig.width=11, message=FALSE, warning=FALSE}
# Violin + boxplot per variable, split by Grid state. The same dodge object is
# shared by both layers so the boxplots sit inside their violins.
dodge <- position_dodge(width = 2)
ggplot(df.melt, aes(x = variable, y = value, fill = Grid)) +
  geom_violin(position = dodge, width = 3) +
  geom_boxplot(position = dodge, width = 0.5) +
  facet_wrap(~variable, scales = "free") +
  labs(x = "features", y = "values") +
  ggtitle("Violin Plot") +
  theme(axis.text.x = element_blank())

```


Correlation across the entire data frame:
```{r, fig.height=6, fig.width=11, message=FALSE, warning=FALSE}
library(ggcorrplot)
library(psych)
# For Numerical Variables
# Logical mask of numeric columns (Grid is a factor and is left out).
nums <- vapply(df1, is.numeric, logical(1))
num_df <- df1[, nums]
# Scatter-plot matrix with correlations for every pair of numeric columns.
pairs.panels(num_df)
```


Almost all variables have a correlation above 0.7, apart from ambient temperature.   


```{r}
# Principal component analysis of the numeric columns; every column is
# centred and scaled to unit variance before the decomposition.
pr.com <- prcomp(num_df, center = TRUE, scale. = TRUE)
summary(pr.com)
```



```{r}
# Variance of each principal component is its squared standard deviation;
# dividing by the total gives the proportion of variance explained.
pr.var <- pr.com$sdev^2
pve <- pr.var / sum(pr.var)

# 2 x 2 panel layout for the plots below.
par(mfrow = c(2, 2))

# Proportion of variance per component, on a fixed 0..1 axis.
plot(
  pve,
  type = "b", ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained"
)

# Running total of the explained variance.
plot(
  cumsum(pve),
  type = "b", ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained"
)

# Same proportions again, this time with the y axis fitted to the data.
prop_varex <- pr.var / sum(pr.var)
plot(
  prop_varex,
  type = "b",
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained"
)
```



