Exploratory Analysis
This notebook uses the Solar Power Generation Data, which covers two solar power plants; each plant has a power generation dataset and a sensor readings dataset.
The objective of this exercise is to use various data visualization techniques to explore the power generation and sensor readings of two solar power plants and their differences. The practice goals include data wrangling using data.table package and plotting against time/date/datetime.
Reference notebooks:
* Desc. Analytics of Solar Panels with R and Plotly
* Ensemble learning lib: MLens (Python)
* How to manage a solar power plant (Python)
* Solar Power Machine Learning I (Python)
* Solar_power_plant_analysis (Python)
Load libraries
library(janitor)
library(ggpubr)
library(lubridate)
library(tidyverse)
library(data.table)
library(PerformanceAnalytics)
library(corrplot)
# Interactive preview of the Plant 1 generation table.
# `data` is only assigned further down (by `data <- load()`); before that the
# name resolves to the utils::data function and `data$p1_gen` raises an error,
# so the preview is skipped until the list of tables exists.
if (is.list(data)) view(data$p1_gen)
Import data
# load all data
# Read the Plant 1 generation CSV (relative to the working directory)
# with fread() and return the resulting table.
load_p1_gen <- function() {
  fread("Plant_1_Generation_Data.csv")
}
# Read the Plant 1 weather-sensor CSV (relative to the working directory)
# with fread() and return the resulting table.
load_p1_weather <- function() {
  fread("Plant_1_Weather_Sensor_Data.csv")
}
# Read the Plant 2 generation CSV (relative to the working directory)
# with fread() and return the resulting table.
load_p2_gen <- function() {
  fread("Plant_2_Generation_Data.csv")
}
# Read the Plant 2 weather-sensor CSV (relative to the working directory)
# with fread() and return the resulting table.
load_p2_weather <- function() {
  fread("Plant_2_Weather_Sensor_Data.csv")
}
# load data in a list
# Gather the four raw tables into one named list with the elements
# p1_gen, p1_weather, p2_gen and p2_weather (read in that order).
# NOTE(review): defining `load` here masks base::load() in this session.
load <- function() {
  list(
    p1_gen = load_p1_gen(),
    p1_weather = load_p1_weather(),
    p2_gen = load_p2_gen(),
    p2_weather = load_p2_weather()
  )
}
data <- load()
Data cleaning
# clean names
# Lower-case the column names of each of the four tables so that later code
# can refer to date_time, plant_id, source_key, dc_power, ... consistently.
# NOTE(review): `names<-` on a data.table may copy the table; data.table's
# setnames() is the by-reference alternative — confirm before switching.
names(data$p1_gen) <- tolower(names(data$p1_gen))
names(data$p1_weather) <- tolower(names(data$p1_weather))
names(data$p2_gen) <- tolower(names(data$p2_gen))
names(data$p2_weather) <- tolower(names(data$p2_weather))
# parse datetime and factors
# Type conversion for the Plant 1 generation table.
# date_time is parsed with dmy_hm() (day-month-year hour:minute), while
# plant_id and source_key are turned into factors. The columns are replaced
# with `:=`, i.e. on the table that was passed in; that table is the result.
clean_p1_gen <- function(data) {
  data[, `:=`(
    date_time = dmy_hm(date_time),
    plant_id = as.factor(plant_id),
    source_key = as.factor(source_key)
  )]
}
# parse datetime and factors
# Type conversion for the remaining tables.
# date_time goes through as_datetime(), while plant_id and source_key are
# turned into factors. The columns are replaced with `:=`, i.e. on the table
# that was passed in; that table is the result.
clean_data <- function(data) {
  data[, `:=`(
    date_time = as_datetime(date_time),
    plant_id = as.factor(plant_id),
    source_key = as.factor(source_key)
  )]
}
# clean all data
# Apply the type conversions to every table in the list.
# Plant 1 generation has its own date format and therefore its own cleaner;
# the other three tables share clean_data(). Returns the updated list.
clean <- function(data) {
  data$p1_gen <- clean_p1_gen(data$p1_gen)
  for (tbl in c("p1_weather", "p2_gen", "p2_weather")) {
    data[[tbl]] <- clean_data(data[[tbl]])
  }
  data
}
# Clean all four tables.
# NOTE(review): this rebinds the name `clean_data` from the helper function to
# the cleaned list, so clean() cannot be re-run in the same session without
# redefining the helper first.
clean_data <- clean(data)

# summary() of every element of a list, returned as a list.
# NOTE(review): this definition masks dplyr::summarise in the global environment.
summarise <- function(data) {
  lapply(data, summary)
}
summarise(clean_data)
$p1_gen
date_time plant_id source_key dc_power ac_power daily_yield total_yield
Min. :2020-05-15 00:00:00 4135001:68778 bvBOhCH3iADSZry: 3155 Min. : 0 Min. : 0.00 Min. : 0 Min. :6183645
1st Qu.:2020-05-24 00:45:00 1BY6WEcLGh8j5v7: 3154 1st Qu.: 0 1st Qu.: 0.00 1st Qu.: 0 1st Qu.:6512003
Median :2020-06-01 14:30:00 7JYdWkrLSPkdwr4: 3133 Median : 429 Median : 41.49 Median :2659 Median :7146685
Mean :2020-06-01 08:02:49 VHMLBKoKgIrUVDU: 3133 Mean : 3147 Mean : 307.80 Mean :3296 Mean :6978712
3rd Qu.:2020-06-09 20:00:00 ih0vzX44oOqAx2f: 3130 3rd Qu.: 6367 3rd Qu.: 623.62 3rd Qu.:6274 3rd Qu.:7268706
Max. :2020-06-17 23:45:00 ZnxXDlPa8U1GXgE: 3130 Max. :14471 Max. :1410.95 Max. :9163 Max. :7846821
(Other) :49943
$p1_weather
date_time plant_id source_key ambient_temperature module_temperature irradiation
Min. :2020-05-15 00:00:00 4135001:3182 HmiyD2TTLFNqkNe:3182 Min. :20.40 Min. :18.14 Min. :0.00000
1st Qu.:2020-05-23 22:48:45 1st Qu.:22.71 1st Qu.:21.09 1st Qu.:0.00000
Median :2020-06-01 09:52:30 Median :24.61 Median :24.62 Median :0.02465
Mean :2020-06-01 05:52:22 Mean :25.53 Mean :31.09 Mean :0.22831
3rd Qu.:2020-06-09 16:56:15 3rd Qu.:27.92 3rd Qu.:41.31 3rd Qu.:0.44959
Max. :2020-06-17 23:45:00 Max. :35.25 Max. :65.55 Max. :1.22165
$p2_gen
date_time plant_id source_key dc_power ac_power daily_yield
Min. :2020-05-15 00:00:00 4136001:67698 81aHJ1q11NBPMrL: 3259 Min. : 0.0 Min. : 0.0 Min. : 0.0
1st Qu.:2020-05-23 21:00:00 9kRcWv60rDACzjR: 3259 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 272.8
Median :2020-06-01 23:00:00 LlT2YUhhzqhg5Sw: 3259 Median : 0.0 Median : 0.0 Median :2911.0
Mean :2020-06-01 10:44:33 LYwnQax7tkwH5Cb: 3259 Mean : 246.7 Mean : 241.3 Mean :3294.9
3rd Qu.:2020-06-09 23:30:00 oZZkBaNadn6DNKz: 3259 3rd Qu.: 446.6 3rd Qu.: 438.2 3rd Qu.:5534.0
Max. :2020-06-17 23:45:00 PeE6FRyGXUgsRhN: 3259 Max. :1420.9 Max. :1385.4 Max. :9873.0
(Other) :48144
total_yield
Min. :0.000e+00
1st Qu.:1.996e+07
Median :2.826e+08
Mean :6.589e+08
3rd Qu.:1.348e+09
Max. :2.248e+09
$p2_weather
date_time plant_id source_key ambient_temperature module_temperature irradiation
Min. :2020-05-15 00:00:00 4136001:3259 iq8k7ZNt4Mwm3w0:3259 Min. :20.94 Min. :20.27 Min. :0.00000
1st Qu.:2020-05-23 12:07:30 1st Qu.:24.60 1st Qu.:23.72 1st Qu.:0.00000
Median :2020-06-01 00:00:00 Median :26.98 Median :27.53 Median :0.01904
Mean :2020-06-01 00:04:35 Mean :28.07 Mean :32.77 Mean :0.23274
3rd Qu.:2020-06-09 12:07:30 3rd Qu.:31.06 3rd Qu.:40.48 3rd Qu.:0.43872
Max. :2020-06-17 23:45:00 Max. :39.18 Max. :66.64 Max. :1.09877
# drop singular variables
# Remove the columns that hold a single value per table:
# plant_id everywhere, plus source_key in the two weather tables.
# The columns are deleted with `:= NULL`, i.e. on the tables themselves.
drop_sv <- function(data) {
  for (tbl in c("p1_gen", "p2_gen")) {
    data[[tbl]][, plant_id := NULL]
  }
  for (tbl in c("p1_weather", "p2_weather")) {
    data[[tbl]][, c("plant_id", "source_key") := NULL]
  }
  data
}
data2 <- drop_sv(clean_data)
Pairplot
# Spearman pair plot (scatter, histogram, correlation) of a generation table.
#
# data2: a data.table whose columns are date_time, one further leading
#        non-numeric column, then the numeric measures.
# Draws the chart via chart.Correlation() and returns its result.
#
# The column removal is done on a copy: `:=` works by reference, so deleting
# date_time on the argument itself would also delete it from the caller's
# table, which later steps still need.
pairplot <- function(data2) {
  plot_data <- copy(data2)
  plot_data[, date_time := NULL]
  # after date_time is gone, the first remaining column is dropped as well
  chart.Correlation(plot_data[, -1], histogram = TRUE, method = c("spearman"))
}
pairplot(data2$p1_gen)

Distribution
# function for generation distribution
# Histograms of the four generation measures, arranged in a 2 x 2 grid.
#
# data: table with the columns ac_power, dc_power, daily_yield, total_yield.
# Returns the arranged figure produced by ggarrange().
plt_gen_dist <- function(data) {
  # one histogram in the shared generation style for the given aes() mapping
  hist_of <- function(mapping) {
    ggplot(data, mapping) + geom_histogram(alpha = 0.7, fill = "#457b9d")
  }
  ggarrange(
    hist_of(aes(x = ac_power)),
    hist_of(aes(x = dc_power)),
    hist_of(aes(x = daily_yield)),
    hist_of(aes(x = total_yield)),
    nrow = 2, ncol = 2
  )
}
#function for weather distribution
# Histograms of the three sensor measures, arranged on a 2 x 2 grid
# in the order ambient temperature, irradiation, module temperature.
#
# data: table with the columns ambient_temperature, module_temperature,
#       irradiation.
# Returns the arranged figure produced by ggarrange().
plt_wx_dist <- function(data) {
  # one histogram in the shared sensor style for the given aes() mapping
  hist_of <- function(mapping) {
    ggplot(data, mapping) + geom_histogram(alpha = 0.7, fill = "#faa307")
  }
  ggarrange(
    hist_of(aes(x = ambient_temperature)),
    hist_of(aes(x = irradiation)),
    hist_of(aes(x = module_temperature)),
    nrow = 2, ncol = 2
  )
}
Plant 1
# Plant 1: distribution of the generation measures
fig1a <- plt_gen_dist(data2$p1_gen)
annotate_figure(
  fig1a,
  top = text_grob("Plant 1: Power generation", size = 12)
)

# Plant 1: distribution of the sensor readings
fig1b <- plt_wx_dist(data2$p1_weather)
annotate_figure(
  fig1b,
  top = text_grob("Plant 1: Sensor readings ", size = 12)
)

Plant 2
# Plant 2: distribution of the generation measures
fig2a <- plt_gen_dist(data2$p2_gen)
annotate_figure(
  fig2a,
  top = text_grob("Plant 2: Solar power generation", size = 12)
)

# Plant 2: distribution of the sensor readings
fig2b <- plt_wx_dist(data2$p2_weather)
annotate_figure(
  fig2b,
  top = text_grob("Plant 2: Sensor readings ", size = 12)
)

Daily summed yield
# function
# Sum the daily_yield readings per calendar day.
# A `day` column (date part of date_time) is added with `:=`, so it also
# appears on the table that was passed in. Returns one row per day with the
# columns day and daily_yield_sum.
get_daily_summed_yield <- function(data) {
  data[, day := date(date_time)]
  data[, list(daily_yield_sum = sum(daily_yield)), by = "day"]
}
# Scatter plot of the summed daily yield per day with a linear trend line
# (no confidence band).
#
# data: table with the columns day and daily_yield_sum.
# Returns the ggplot object so callers can add labels.
plt_daily_yield <- function(data) {
  ggplot(data, aes(x = day, y = daily_yield_sum)) +
    geom_point() +
    geom_smooth(method = lm, se = FALSE)
}
# Per-day sums for both plants, plotted side by side
daily_summed_yield_p1 <- get_daily_summed_yield(data2$p1_gen)
daily_summed_yield_p2 <- get_daily_summed_yield(data2$p2_gen)
fig3a <- plt_daily_yield(daily_summed_yield_p1) +
  labs(title = "Plant 1: Daily summed yield")
fig3b <- plt_daily_yield(daily_summed_yield_p2) +
  labs(title = "Plant 2: Daily summed yield")
ggarrange(fig3a, fig3b, ncol = 2, nrow = 1)

Data preparation
# plant 1
# reduced_p1_gen
# Plant 1: sum the generation measures over all rows sharing a timestamp
reduced_p1_gen <- data2$p1_gen
reduced_p1_gen2 <- reduced_p1_gen[,
  lapply(.SD, sum, na.rm = TRUE),
  by = list(date_time),
  .SDcols = c("dc_power", "ac_power", "daily_yield", "total_yield")
]
# split the timestamp into a calendar date and a time of day
reduced_p1_gen2[, date := date(date_time)]
reduced_p1_gen2[, time := as.ITime(date_time)]
# time of day as POSIXct so that datetime axis scales can be used
reduced_p1_gen2$time <- as.POSIXct(
  strptime(reduced_p1_gen2$time, format = "%H:%M:%S")
)

# keyed inner join of the sensor readings with the aggregated generation data
p1_wx <- data2$p1_weather
setkey(p1_wx, date_time)
setkey(reduced_p1_gen2, date_time)
p1 <- p1_wx[reduced_p1_gen2, nomatch = 0]
dim(p1)
[1] 3157 10
# plant 2
# merge plant 2 data
# Plant 2: sum the generation measures over all rows sharing a timestamp
reduced_p2_gen <- data2$p2_gen
reduced_p2_gen2 <- reduced_p2_gen[,
  lapply(.SD, sum, na.rm = TRUE),
  by = list(date_time),
  .SDcols = c("dc_power", "ac_power", "daily_yield", "total_yield")
]
# split the timestamp into a calendar date and a time of day
reduced_p2_gen2[, date := date(date_time)]
reduced_p2_gen2[, time := as.ITime(date_time)]
# time of day as POSIXct so that datetime axis scales can be used
reduced_p2_gen2$time <- as.POSIXct(
  strptime(reduced_p2_gen2$time, format = "%H:%M:%S")
)

# keyed inner join of the sensor readings with the aggregated generation data
p2_wx <- data2$p2_weather
setkey(p2_wx, date_time)
setkey(reduced_p2_gen2, date_time)
p2 <- p2_wx[reduced_p2_gen2, nomatch = 0]
dim(p2)
[1] 3259 10
Plant 1
P1: dc power
# dc_power (time)
# tick labels for time-of-day axes expressed in seconds
xlabel <- c("00:00:00", "06:00:00", "12:00:00", "18:00:00")

# DC power by time of day: raw points plus the mean per time slot (red line).
# Axis labels use hours:minutes ("%H:%M"); "%H:%S" would print seconds in
# place of minutes. `fun` replaces the deprecated `fun.y` argument.
dc1a <- ggplot(p1, aes(x = time, y = dc_power)) +
  geom_point(size = 0.2, color = "#457b9d", alpha = 0.7) +
  stat_summary(
    aes(y = dc_power, group = 1),
    fun = mean, color = "red", geom = "line"
  ) +
  scale_x_datetime(date_labels = "%H:%M")

# DC power stacked per calendar date
dc1b <- ggplot(p1, aes(x = date, y = dc_power)) +
  geom_col(fill = "#457b9d") +
  theme(axis.text.x = element_text(angle = 45))

ggarrange(dc1a, dc1b, labels = c("a", "b"), ncol = 2, nrow = 1)

- DC power
- plant 1 produces power from ~06.00 to ~18.00
- maximum power on May 25 2020
P1: daily yield
# daily_yield
# Daily yield by time of day: raw points plus the mean per time slot (red).
# Axis labels use hours:minutes ("%H:%M"); "%H:%S" would print seconds in
# place of minutes. `fun` replaces the deprecated `fun.y` argument.
dy1a <- ggplot(p1, aes(x = time, y = daily_yield)) +
  geom_point(size = 0.2, color = "#457b9d", alpha = 0.5) +
  stat_summary(
    aes(y = daily_yield, group = 1),
    fun = mean, color = "red", geom = "line"
  ) +
  scale_x_datetime(date_labels = "%H:%M")

# one small panel per calendar date
dy1b <- ggplot(p1, aes(x = time, y = daily_yield)) +
  geom_point(size = 0.2) +
  facet_wrap(~date) +
  scale_y_continuous(breaks = c(0, 100000, 200000)) +
  theme(axis.text.x = element_blank())

ggarrange(dy1a, dy1b, labels = c("a", "b"), nrow = 1, ncol = 2)

# distribution of the readings per date
dy1c <- ggplot(p1, aes(x = factor(date), y = daily_yield)) +
  geom_boxplot() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "date")

# readings stacked per date
dy1d <- ggplot(p1, aes(x = factor(date), y = daily_yield)) +
  geom_col(fill = "#457b9d") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "date")

ggarrange(dy1c, dy1d, labels = c("c", "d"), nrow = 1, ncol = 2)

- Daily yield
- daily yield decreases after 18.00.
- there are missing data on some dates, for example 2020-05-20.
- daily yield changes daily, and there are no outliers observed.
- the sum of daily yield changes daily.
P1: ambient temperature
# ambient temp (time)
# Ambient temperature by time of day: raw points plus the mean per time slot.
# Axis labels use hours:minutes ("%H:%M"); "%H:%S" would print seconds in
# place of minutes. `fun` replaces the deprecated `fun.y` argument.
at1a <- ggplot(p1, aes(x = time, y = ambient_temperature)) +
  geom_point(size = 0.2, color = "#457b9d", alpha = 0.5) +
  stat_summary(
    aes(y = ambient_temperature, group = 1),
    fun = mean, color = "red", geom = "line"
  ) +
  scale_x_datetime(date_labels = "%H:%M")

# distribution of the readings per date
at1b <- ggplot(p1, aes(x = factor(date), y = ambient_temperature)) +
  geom_boxplot() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "date", y = "temperature (°C)")

# mean ambient temperature per date
dat <- p1[, .(mean_at = mean(ambient_temperature)), .(date)]
at1c <- ggplot(dat, aes(x = date, y = mean_at)) +
  geom_line(color = "#457b9d") +
  labs(y = "mean_ambient_temperature (°C)")

# day-over-day relative change: (today - yesterday) / yesterday
cols <- c("mean_at")
dat[, (paste0(cols, "_pctChange")) := lapply(.SD, function(col) {
  (col - shift(col, 1, type = "lag")) / shift(col, 1, type = "lag")
}), .SDcols = cols]
at1d <- ggplot(dat, aes(x = date, y = mean_at_pctChange)) +
  geom_line(color = "#faa307") +
  scale_y_continuous(labels = scales::percent)

ggarrange(at1a, at1b, labels = c("a", "b"), ncol = 2, nrow = 1)

ggarrange(at1c, at1d, labels = c("c", "d"), ncol = 2, nrow = 1)

- Ambient temperature
- the ambient temperature of records in May is higher than June.
- the range of ambient temperature percentage change is larger in May than June.
# time series plot
# sesonality 7 days
# Seasonal-trend decomposition of the daily mean ambient temperature,
# assuming a seasonality of 7 days (one week per period).
ts_at <- ts(dat$mean_at, frequency = 7)
stl_at <- stl(ts_at, s.window = "periodic")
plot(stl_at)

P1: module temperature
# Module temperature by time of day: raw points plus the mean per time slot.
# Axis labels use hours:minutes ("%H:%M"); "%H:%S" would print seconds in
# place of minutes. `fun` replaces the deprecated `fun.y` argument.
mt1a <- ggplot(p1, aes(x = time, y = module_temperature)) +
  geom_point(size = 0.2, color = "#457b9d", alpha = 0.7) +
  stat_summary(
    aes(y = module_temperature, group = 1),
    fun = mean, color = "red", geom = "line"
  ) +
  scale_x_datetime(date_labels = "%H:%M")

# distribution of the readings per date
mt1b <- ggplot(p1, aes(x = factor(date), y = module_temperature)) +
  geom_boxplot() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "date", y = "temperature (°C)")

# mean module temperature per date; the axis label names the module
# temperature (it previously carried the ambient-temperature label)
dmt <- p1[, .(mean_mt = mean(module_temperature)), .(date)]
mt1c <- ggplot(dmt, aes(x = date, y = mean_mt)) +
  geom_line(color = "#457b9d") +
  labs(y = "mean_module_temperature (°C)")

# day-over-day relative change: (today - yesterday) / yesterday
cols <- c("mean_mt")
dmt[, (paste0(cols, "_pctChange")) := lapply(.SD, function(col) {
  (col - shift(col, 1, type = "lag")) / shift(col, 1, type = "lag")
}), .SDcols = cols]
mt1d <- ggplot(dmt, aes(x = date, y = mean_mt_pctChange)) +
  geom_line(color = "#faa307") +
  scale_y_continuous(labels = scales::percent)

ggarrange(mt1a, mt1b, labels = c("a", "b"), ncol = 2, nrow = 1)

ggarrange(mt1c, mt1d, labels = c("c", "d"), ncol = 2, nrow = 1)

- there are four dates with outliers
P1: irradiation
# plot
# Irradiation by time of day: raw points plus the mean per time slot (red).
# p1$time is POSIXct, so the datetime scale is used as in the other
# time-of-day plots; numeric breaks given in seconds since midnight do not
# line up with POSIXct values. `fun` replaces the deprecated `fun.y`.
ir1a <- ggplot(p1, aes(x = time, y = irradiation)) +
  geom_point(size = 0.2, color = "#457b9d", alpha = 0.7) +
  stat_summary(
    aes(y = irradiation, group = 1),
    fun = mean, color = "red", geom = "line"
  ) +
  scale_x_datetime(date_labels = "%H:%M")

# distribution of the readings per date
ir1b <- ggplot(p1, aes(x = factor(date), y = irradiation)) +
  geom_boxplot() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "date")

# summed irradiation per date
irr <- p1[, .(sum_irr = sum(irradiation)), .(date)]
ir1c <- ggplot(irr, aes(x = date, y = sum_irr)) +
  geom_line(color = "#457b9d")

# box plot on top, the two remaining plots side by side underneath
ggarrange(
  ir1b,
  ggarrange(ir1a, ir1c, ncol = 2),
  nrow = 2
)

P1: spearman correlation
# List the columns of the merged Plant 1 table.
# NOTE(review): the recorded output already contains delta_temperature, which
# is only created in the next cell — the notebook was apparently run out of order.
colnames(p1)
[1] "date_time" "ambient_temperature" "module_temperature" "irradiation" "dc_power" "ac_power"
[7] "daily_yield" "total_yield" "date" "time" "delta_temperature"
# delta temperature
# absolute difference between ambient and module temperature
p1$delta_temperature <- with(p1, abs(ambient_temperature - module_temperature))
summary(p1$delta_temperature)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0023 1.5742 2.4745 7.4173 13.2152 35.2430
# correlation
# Spearman pair plot of the numeric columns; positions 1, 9 and 10
# (date_time, date and time in the column listing above) are left out.
p1_c <- p1[, -c(1, 9, 10)]
chart.Correlation(p1_c, histogram = TRUE, method = c("spearman"))

- daily yield and total yield are not correlated with other features
# correlation heatmap without daily_yield and total_yield
# function
# Spearman correlations via Hmisc::rcorr().
# Every element of the rcorr result is converted to a data frame and the
# resulting list is returned.
cors <- function(df) {
  rc <- Hmisc::rcorr(as.matrix(df), type = c("spearman"))
  map(rc, function(m) data.frame(m))
}
# Long-format correlation table built from cors().
#
# df: table of numeric columns.
# Returns one row per (measure1, measure2) pair with one column per element of
# the cors() result (the code below relies on `r` and `P`), plus:
#   sig_p    - TRUE where P < .05
#   p_if_sig - P where significant, NA otherwise
#   r_if_sig - r where significant, NA otherwise
#
# `cols` and `names_to` are passed by name: newer tidyr versions place `...`
# directly after `cols`, so a positional third argument is no longer taken as
# `names_to`.
formatted_cors <- function(df) {
  cors(df) %>%
    map(~ rownames_to_column(.x, var = "measure1")) %>%
    map(~ pivot_longer(.x, cols = -measure1, names_to = "measure2")) %>%
    bind_rows(.id = "id") %>%
    pivot_wider(names_from = id, values_from = value) %>%
    mutate(
      sig_p = P < .05,
      p_if_sig = ifelse(P < .05, P, NA),
      r_if_sig = ifelse(P < .05, r, NA)
    )
}
# plot
# Heatmap of the Spearman correlations; columns 1 and 7-10 are removed by
# position first. Tiles are coloured by r; the number is printed only where
# the correlation is significant (r_if_sig).
p1_c <- p1[, -c(1, 7, 8, 9, 10)]
ggplot(
  formatted_cors(p1_c),
  aes(measure1, measure2, fill = r, label = round(r_if_sig, 3))
) +
  geom_tile() +
  labs(
    x = NULL, y = NULL,
    fill = "Spearman's\nCorrelation",
    title = "Plant 1: Correlations",
    subtitle = "without daily_yield and total_yield"
  ) +
  scale_fill_gradient2(
    mid = "#e0fbfc", low = "#ee6c4d", high = "#293241",
    limits = c(0, 1)
  ) +
  geom_text(color = "white") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme(axis.text.x = element_text(angle = 90))

P1: reg plots
# reg plot
# Plant 1 scatter plots with a fitted regression line, arranged 3 x 2
p1a <- ggscatter(
  p1, x = "dc_power", y = "ac_power",
  add = "reg.line", color = "#8F3931FF", alpha = 0.5
) + theme_minimal()
p1b <- ggscatter(
  p1, x = "ambient_temperature", y = "dc_power",
  add = "reg.line", color = "#767676FF", alpha = 0.5
) + theme_minimal()
p1c <- ggscatter(
  p1, x = "module_temperature", y = "dc_power",
  add = "reg.line", color = "#FFA319FF", alpha = 0.5
) + theme_minimal()
p1d <- ggscatter(
  p1, x = "irradiation", y = "dc_power",
  add = "reg.line", color = "#58593FFF", alpha = 0.5
) + theme_minimal()
p1e <- ggscatter(
  p1, x = "delta_temperature", y = "dc_power",
  add = "reg.line", color = "#155F83FF", alpha = 0.5
) + theme_minimal()
p1f <- ggscatter(
  p1, x = "delta_temperature", y = "irradiation",
  add = "reg.line", color = "#C16622FF", alpha = 0.5
) + theme_minimal()
ggarrange(
  p1a, p1b, p1c, p1d, p1e, p1f,
  labels = c("a", "b", "c", "d", "e", "f"),
  ncol = 3, nrow = 2
)

- Plant 1 Reg plots
- inverters convert dc power to ac power linearly.
- dc power increases non linearly with ambient temperature.
- some linearity between dc power production and module temperature.
- dc power increases with irradiation.
- dc power is influenced by delta temperature.
- some linearity between irradiation and delta temperature.
- Plant 1 summary
- yield (daily_yield and total_yield) is not correlated to ac/dc power, temperature and irradiation.
- transfer function between ac and dc power is linear.
- dc power is influenced by ambient temperature, module temperature, irradiation and heat transfer between air and module.
- all (n=22) inverters of Plant 1 lost around 90% of the dc power during conversion.
Plant 1 vs. Plant 2
# dc power (daily)
# Plant 1 vs. Plant 2, six panels with a shared legend.
# Each panel overlays the Plant 2 layer on the Plant 1 layer (the reverse
# for the total-yield panel).

# dc power per date
pp1 <- ggplot(data = reduced_p1_gen2) +
  geom_col(aes(x = date, y = dc_power, fill = "plant 1")) +
  geom_col(data = reduced_p2_gen2, aes(x = date, y = dc_power, fill = "plant 2")) +
  scale_fill_manual(values = c("#457b9d", "#faa307")) +
  labs(fill = "", title = "DC power (daily)") +
  theme(title = element_text(size = 9))

# dc power by time of day; the title names DC power (it previously read
# "AC power (time)") and the empty legend title is set on the colour
# aesthetic that these point layers actually use
pp2 <- ggplot(data = reduced_p1_gen2) +
  geom_point(aes(x = time, y = dc_power, color = "plant 1"), size = 0.3, alpha = 0.6) +
  geom_point(
    data = reduced_p2_gen2,
    aes(x = time, y = dc_power, color = "plant 2"),
    size = 0.3, alpha = 0.9
  ) +
  scale_color_manual(values = c("#457b9d", "#faa307")) +
  labs(color = "", title = "DC power (time)") +
  scale_x_datetime(date_labels = "%H:%M:%S") +
  theme(title = element_text(size = 9))

# ac power per date
pp3 <- ggplot(data = reduced_p1_gen2) +
  geom_col(aes(x = date, y = ac_power, fill = "plant 1")) +
  geom_col(data = reduced_p2_gen2, aes(x = date, y = ac_power, fill = "plant 2")) +
  scale_fill_manual(values = c("#457b9d", "#faa307")) +
  labs(fill = "", title = "AC power (daily)") +
  theme(title = element_text(size = 9))

# ac power by time of day
pp4 <- ggplot(data = reduced_p1_gen2) +
  geom_point(aes(x = time, y = ac_power, color = "plant 1"), size = 0.3, alpha = 0.6) +
  geom_point(
    data = reduced_p2_gen2,
    aes(x = time, y = ac_power, color = "plant 2"),
    size = 0.3, alpha = 0.9
  ) +
  scale_color_manual(values = c("#457b9d", "#faa307")) +
  labs(color = "", title = "AC power (time)") +
  scale_x_datetime(date_labels = "%H:%M:%S") +
  theme(title = element_text(size = 9))

# daily yield summed per date
reduced_p1_dyd <- reduced_p1_gen2[,
  lapply(.SD, sum, na.rm = TRUE),
  by = list(date), .SDcols = c("daily_yield")
]
reduced_p2_dyd <- reduced_p2_gen2[,
  lapply(.SD, sum, na.rm = TRUE),
  by = list(date), .SDcols = c("daily_yield")
]
pp5 <- ggplot(data = reduced_p1_dyd) +
  geom_col(aes(x = date, y = daily_yield, fill = "plant 1")) +
  geom_col(data = reduced_p2_dyd, aes(x = date, y = daily_yield, fill = "plant 2")) +
  scale_fill_manual(values = c("#457b9d", "#faa307")) +
  labs(fill = "", title = "Daily yield (date)") +
  theme(title = element_text(size = 9))

# total yield averaged per date
reduced_p1_aty <- reduced_p1_gen2[,
  lapply(.SD, mean, na.rm = TRUE),
  by = list(date), .SDcols = c("total_yield")
]
reduced_p2_aty <- reduced_p2_gen2[,
  lapply(.SD, mean, na.rm = TRUE),
  by = list(date), .SDcols = c("total_yield")
]
pp6 <- ggplot(data = reduced_p2_aty) +
  geom_col(aes(x = date, y = total_yield, fill = "plant 2")) +
  geom_col(data = reduced_p1_aty, aes(x = date, y = total_yield, fill = "plant 1")) +
  scale_fill_manual(values = c("#457b9d", "#faa307")) +
  labs(fill = "", title = "Average total yield") +
  theme(title = element_text(size = 9))

ggarrange(
  pp1, pp2, pp3, pp4, pp5, pp6,
  labels = c("a", "b", "c", "d", "e", "f"),
  ncol = 3, nrow = 2, common.legend = TRUE, legend = "top"
)


- Plant 1 and Plant 2 generation
- Plant 1 produced around 6 times more dc power than Plant 2.
- Plant 1 produces more ac power than Plant 2.
- Both plants produced similar daily yield (for each date).
- Large difference between Plant 1 and Plant 2 average total yield (for each date).
# Sensor readings of both plants by time of day.
# A time-of-day column is added to each weather table (with `:=`, so it also
# appears on p1_wx / p2_wx) and then converted to POSIXct for the datetime axis.
p1_wx_ir <- p1_wx[, time := as.ITime(date_time)]
p1_wx_ir$time <- as.POSIXct(strptime(p1_wx_ir$time, format = "%H:%M:%S"))
p2_wx <- data2$p2_weather
p2_wx_ir <- p2_wx[, time := as.ITime(date_time)]
p2_wx_ir$time <- as.POSIXct(strptime(p2_wx_ir$time, format = "%H:%M:%S"))

# irradiation of both plants; the empty legend title is set on the colour
# aesthetic used by the point layers, and `date_labels` is spelled out instead
# of relying on partial argument matching
irp1 <- ggplot(data = p1_wx_ir) +
  geom_point(aes(x = time, y = irradiation, color = "plant 1"), size = 0.3, alpha = 0.6) +
  geom_point(
    data = p2_wx_ir,
    aes(x = time, y = irradiation, color = "plant 2"),
    size = 0.3, alpha = 0.9
  ) +
  scale_color_manual(values = c("#457b9d", "#faa307")) +
  labs(color = "", title = "Irradiation (time)") +
  scale_x_datetime(date_labels = "%H:%M:%S") +
  theme(title = element_text(size = 9))

# ambient and module temperature, one panel per plant
temp_p1 <- ggplot(data = p1_wx_ir) +
  geom_point(aes(x = time, y = ambient_temperature, color = "Ambient"), size = 0.3, alpha = 0.7) +
  geom_point(
    data = p1_wx_ir,
    aes(x = time, y = module_temperature, color = "Module"),
    size = 0.3, alpha = 0.7
  ) +
  scale_color_manual(values = c("#9c6644", "#00509d")) +
  labs(title = "Plant 1", color = "Temperature") +
  scale_x_datetime(date_labels = "%H:%M:%S") +
  theme(title = element_text(size = 9))
temp_p2 <- ggplot(data = p2_wx_ir) +
  geom_point(aes(x = time, y = ambient_temperature, color = "Ambient"), size = 0.3, alpha = 0.7) +
  geom_point(
    data = p2_wx_ir,
    aes(x = time, y = module_temperature, color = "Module"),
    size = 0.3, alpha = 0.7
  ) +
  scale_color_manual(values = c("#9c6644", "#00509d")) +
  labs(title = "Plant 2", color = "Temperature") +
  scale_x_datetime(date_labels = "%H:%M:%S") +
  theme(title = element_text(size = 9))

# irradiation on top, the two temperature panels side by side underneath
ggarrange(
  irp1,
  ggarrange(temp_p1, temp_p2, ncol = 2),
  nrow = 2
)

- Plant 1 and Plant 2 sensor
- both plants have similar irradiation by time
- both plants have similar temperature (ambient and module) by time
# daily yield by source key
# Horizontal box plots of daily_yield per source_key, one panel per plant
da1 <- data$p1_gen
dap1 <- ggplot(da1, aes(x = source_key, y = daily_yield)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Plant 1") +
  theme(title = element_text(size = 9))
da2 <- data$p2_gen
dap2 <- ggplot(da2, aes(x = source_key, y = daily_yield)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Plant 2") +
  theme(title = element_text(size = 9))
dap <- ggarrange(dap1, dap2, ncol = 2, nrow = 1)
annotate_figure(
  dap,
  top = text_grob("Daily yield by source key", size = 12)
)

- Plant 1 and Plant 2 source keys
- both plants have 22 source keys each.
- There are more differences in the median daily yield (datetime) between source keys in Plant 2 than in Plant 1.
Plant 2
P2: new variables
# new variables
# absolute difference between ambient and module temperature
p2$delta_temperature <- abs(p2$ambient_temperature - p2$module_temperature)

# row-to-row differences (current row minus previous row); the first row has
# no predecessor and therefore gets NA
p2 <- within(p2, diff_daily_yield <- c(NA, diff(daily_yield)))
p2 <- within(p2, diff_total_yield <- c(NA, diff(total_yield)))
p2 <- within(p2, diff_ambient_temperature <- c(NA, diff(ambient_temperature)))
p2 <- within(p2, diff_module_temperature <- c(NA, diff(module_temperature)))
p2 <- within(p2, diff_ac_power <- c(NA, diff(ac_power)))
head(p2)
P2: spearman correlation
# get spearman correlation
# Spearman correlation matrix of Plant 2, computed on complete rows only;
# columns 1, 9 and 10 are removed by position first.
# NOTE(review): the name `p2c` is reused for a scatter plot further down.
p2c <- p2[, -c(1, 9, 10)]
corr_mat <- cor(p2c, use = "complete.obs", method = "spearman")
# p.mat function
# Pairwise correlation-test p-values for the columns of `mat`.
#
# mat: a matrix, or anything as.matrix() can coerce; one variable per column.
# ...: forwarded unchanged to cor.test() (e.g. method = "spearman").
# Returns an ncol(mat) x ncol(mat) symmetric matrix of p-values with 0 on the
# diagonal and the column names of `mat` as row and column names.
cor.mtest <- function(mat, ...) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat <- matrix(NA, n, n)
  diag(p.mat) <- 0
  # seq_len() leaves the loop empty when there are fewer than two columns;
  # 1:(n - 1) would count down (1, 0) and index a column that does not exist.
  for (i in seq_len(max(n - 1L, 0L))) {
    for (j in (i + 1L):n) {
      tmp <- cor.test(mat[, i], mat[, j], ...)
      # the test is symmetric, so fill both triangles at once
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}
# get p.mat
# p-values for every pair of columns in p2c; the extra arguments are passed
# on to cor.test() through cor.mtest()
p.mat <- cor.mtest(
  p2c,
  method = "s",
  use = "complete.obs"
)
# palette generator for the correlation plot (red - white - blue)
col <- colorRampPalette(
  c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")
)
# Pairwise correlation-test p-values for the columns of `mat`.
# NOTE(review): this repeats the definition given a few lines above.
#
# mat: a matrix, or anything as.matrix() can coerce; one variable per column.
# ...: forwarded unchanged to cor.test() (e.g. method = "spearman").
# Returns an ncol(mat) x ncol(mat) symmetric matrix of p-values with 0 on the
# diagonal and the column names of `mat` as row and column names.
cor.mtest <- function(mat, ...) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat <- matrix(NA, n, n)
  diag(p.mat) <- 0
  # seq_len() leaves the loop empty when there are fewer than two columns;
  # 1:(n - 1) would count down (1, 0) and index a column that does not exist.
  for (i in seq_len(max(n - 1L, 0L))) {
    for (j in (i + 1L):n) {
      tmp <- cor.test(mat[, i], mat[, j], ...)
      # the test is symmetric, so fill both triangles at once
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}
# plot
# Upper-triangle correlation plot, variables ordered by hierarchical
# clustering. Coefficients are printed in black; cells whose p-value exceeds
# sig.level are left blank, and the principal diagonal is hidden.
corrplot(
  corr_mat,
  method = "color",
  col = col(200),
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 90,
  p.mat = p.mat,
  sig.level = 0.01,
  insig = "blank",
  diag = FALSE,
  number.cex = .6,
  tl.cex = .6
)

- Plant 2’s total yield is negatively correlated to all features, except for daily yield.
P2: reg plots
# reg plots
# Plant 2 scatter plots with a fitted regression line, arranged on a 3 x 3 grid.
# NOTE(review): `p2c` is rebound here from the correlation input table to a plot.
p2a <- ggscatter(
  p2, x = "dc_power", y = "ac_power",
  add = "reg.line", color = "#8F3931FF", alpha = 0.6, size = 1
) + theme_bw()
p2b <- ggscatter(
  p2, x = "ac_power", y = "diff_daily_yield",
  add = "reg.line", color = "#767676FF", alpha = 0.6, size = 1
) + theme_bw()
p2c <- ggscatter(
  p2, x = "irradiation", y = "diff_daily_yield",
  add = "reg.line", color = "#FFA319FF", alpha = 0.6, size = 1
) + theme_bw()
p2d <- ggscatter(
  p2, x = "module_temperature", y = "diff_daily_yield",
  add = "reg.line", color = "#58593FFF", alpha = 0.6, size = 1
) + theme_bw()
p2e <- ggscatter(
  p2, x = "delta_temperature", y = "diff_daily_yield",
  add = "reg.line", color = "#155F83FF", alpha = 0.6, size = 1
) + theme_bw()
p2f <- ggscatter(
  p2, x = "diff_daily_yield", y = "diff_total_yield",
  add = "reg.line", color = "#C16622FF", alpha = 0.6, size = 1
) + theme_bw()
p2g <- ggscatter(
  p2, x = "diff_module_temperature", y = "diff_ac_power",
  add = "reg.line", color = "#350E20FF", alpha = 0.6, size = 1
) + theme_bw() + labs(x = "diff_module_temp")
ggarrange(
  p2a, p2b, p2c, p2d, p2e, p2f, p2g,
  labels = c("a", "b", "c", "d", "e", "f", "g"),
  ncol = 3, nrow = 3
)

- Plant 2 reg plots
- inverter lost 0% of the power as dc power = ac power.
- diff_daily_yield (next minus previous) is:
- positive when ac power > 20,000 kW.
- positive or negative with the variation of irradiation
- negative when the module temperature is below 30°C; the PV panels produce energy when the module temperature is around 35°C.
- negative when delta temperature is < 5°C, daily yield decreases every 15 minutes if the difference in module and ambient temperature is < 5°C.
- there is more diff_ac_power when the diff_module_temp is between -5°C and 5°C.
- Summary
- Plant 1 produces 6 times more DC power than plant 2 and loses 90% of it when converting to AC power.
- No losses in Plant 2 when converting DC to AC power.
- AC power output and daily yield are similar for both plants.
- There is a large difference between Plant 1 and Plant 2 average total yield; Plant 2 average total yield is higher than Plant 1.
- Daily yield decreases when the delta temperature is < 5°C
