Glimpse of Data:
data<-read.csv(file="WIND_VXE_2013.csv", header=TRUE)
#data$PCTimeStamp= dmy_hm( as.character(data$PCTimeStamp))
head(data)
# cat("The Data is of:",max(data$PCTimeStamp,na.rm = TRUE)-min(data$PCTimeStamp,na.rm = TRUE),"Days","Date From:",min(data$PCTimeStamp,na.rm = TRUE),"To:",max(data$PCTimeStamp,na.rm = TRUE))
# min(data$PCTimeStamp,na.rm = TRUE)
# max(data$PCTimeStamp,na.rm = TRUE)-min(data$PCTimeStamp,na.rm = TRUE)
# cat("The Data is of:",max(data$PCTimeStamp,na.rm = TRUE)-min(data$PCTimeStamp,na.rm = TRUE),"Days From\n")
# min(data$PCTimeStamp,na.rm = TRUE)
# assign new column names to be more succint (avoid spaces, use _ or . to seperate words)
new.names<-c("date_time","T1_Possible_Power","T2_Possible_Power","T3_Possible_Power","T4_Possible_Power","T5_Possible_Power","T6_Possible_Power","T7_Possible_Power","T1_Total_Active_Power","T2_Total_Active_Power","T3_Total_Active_Power","T4_Total_Active_Power","T5_Total_Active_Power","T6_Total_Active_Power","T7_Total_Active_Power","mean_wind_mps", "min_wind_mps", "max_wind_mps", "cum_energy_delivered_kwh")
# make sure the new names lineup properly with the ones they're replacing...
cbind(names(data), new.names)
new.names
[1,] "PCTimeStamp" "date_time"
[2,] "WTG01_Grid.Production.PossiblePower.Avg...1." "T1_Possible_Power"
[3,] "WTG02_Grid.Production.PossiblePower.Avg...2." "T2_Possible_Power"
[4,] "WTG03_Grid.Production.PossiblePower.Avg...3." "T3_Possible_Power"
[5,] "WTG04_Grid.Production.PossiblePower.Avg...4." "T4_Possible_Power"
[6,] "WTG05_Grid.Production.PossiblePower.Avg...5." "T5_Possible_Power"
[7,] "WTG06_Grid.Production.PossiblePower.Avg...6." "T6_Possible_Power"
[8,] "WTG07_Grid.Production.PossiblePower.Avg...7." "T7_Possible_Power"
[9,] "WTG01_Total.Active.power..8." "T1_Total_Active_Power"
[10,] "WTG02_Total.Active.power..9." "T2_Total_Active_Power"
[11,] "WTG03_Total.Active.power..10." "T3_Total_Active_Power"
[12,] "WTG04_Total.Active.power..11." "T4_Total_Active_Power"
[13,] "WTG05_Total.Active.power..12." "T5_Total_Active_Power"
[14,] "WTG06_Total.Active.power..13." "T6_Total_Active_Power"
[15,] "WTG07_Total.Active.power..14." "T7_Total_Active_Power"
[16,] "MET_Avg..Wind.speed.1..15." "mean_wind_mps"
[17,] "MET_Min..Wind.speed.1..16." "min_wind_mps"
[18,] "MET_Max..Wind.speed.1..17." "max_wind_mps"
[19,] "GRID1_KWH_DEL" "cum_energy_delivered_kwh"
# if yes, replace column names with new names
names(data)<-new.names
No of Data Type Available in Data
**VARIABLES in Data:**
Main data: 19 Columns 52560 Observation
------------------------------------------------------------------------------------------------------
DATA TYPE: Numerical 18 Observation: 52560
Numerical Colums are:
T1_Possible_Power T2_Possible_Power T3_Possible_Power T4_Possible_Power T5_Possible_Power T6_Possible_Power T7_Possible_Power T1_Total_Active_Power T2_Total_Active_Power T3_Total_Active_Power T4_Total_Active_Power T5_Total_Active_Power T6_Total_Active_Power T7_Total_Active_Power mean_wind_mps min_wind_mps max_wind_mps cum_energy_delivered_kwh
------------------------------------------------------------------------------------------------------
DATA TYPE: Categorical data Observation:
Categorical Colums are:
------------------------------------------------------------------------------------------------------
Missing value : Before Pre-processing
missmap(data)
cat("\n ***********NA VALUE COLUMN WISE*********** \n")
***********NA VALUE COLUMN WISE***********
sort(colSums((is.na(data))),decreasing = TRUE)
T3_Possible_Power T3_Total_Active_Power T1_Possible_Power T1_Total_Active_Power
927 927 725 725
T2_Possible_Power T7_Possible_Power T2_Total_Active_Power T7_Total_Active_Power
710 710 710 710
T5_Possible_Power T5_Total_Active_Power T4_Possible_Power T4_Total_Active_Power
685 685 654 654
T6_Possible_Power T6_Total_Active_Power cum_energy_delivered_kwh mean_wind_mps
652 652 59 3
min_wind_mps max_wind_mps date_time
3 3 0
cat("\n ----------------------------------------------------------\n")
----------------------------------------------------------
summary(data)
date_time T1_Possible_Power T2_Possible_Power T3_Possible_Power T4_Possible_Power T5_Possible_Power
01-01-13 00:00: 1 Min. : -3.0 Min. : -3.0 Min. : -2.0 Min. : -3.0 Min. : -3.0
01-01-13 00:10: 1 1st Qu.:191.0 1st Qu.:204.0 1st Qu.:214.0 1st Qu.:222.0 1st Qu.:192.0
01-01-13 00:20: 1 Median :518.0 Median :530.0 Median :552.0 Median :598.0 Median :553.0
01-01-13 00:30: 1 Mean :475.9 Mean :483.5 Mean :496.8 Mean :517.6 Mean :495.3
01-01-13 00:40: 1 3rd Qu.:772.0 3rd Qu.:774.0 3rd Qu.:792.0 3rd Qu.:819.0 3rd Qu.:805.0
01-01-13 00:50: 1 Max. :850.0 Max. :850.0 Max. :850.0 Max. :850.0 Max. :850.0
(Other) :52554 NA's :725 NA's :710 NA's :927 NA's :654 NA's :685
T6_Possible_Power T7_Possible_Power T1_Total_Active_Power T2_Total_Active_Power T3_Total_Active_Power
Min. : -2.0 Min. : -6 Min. :3109970 Min. : 609852 Min. :3254759
1st Qu.:206.0 1st Qu.:180 1st Qu.:3895622 1st Qu.:1391641 1st Qu.:4066341
Median :537.0 Median :497 Median :4744043 Median :2341906 Median :5022455
Mean :489.3 Mean :472 Mean :4608851 Mean :2189450 Mean :4870726
3rd Qu.:786.0 3rd Qu.:785 3rd Qu.:5262894 3rd Qu.:2894952 3rd Qu.:5586471
Max. :850.0 Max. :850 Max. :6045048 Max. :3690817 Max. :6413840
NA's :652 NA's :710 NA's :725 NA's :710 NA's :927
T4_Total_Active_Power T5_Total_Active_Power T6_Total_Active_Power T7_Total_Active_Power mean_wind_mps min_wind_mps
Min. :3341303 Min. :3230186 Min. :3264175 Min. :3136754 Min. : 0.00 Min. : 0.0
1st Qu.:4168935 1st Qu.:4023712 1st Qu.:4085929 1st Qu.:3919523 1st Qu.: 5.60 1st Qu.: 3.6
Median :5159420 Median :4986563 Median :5057922 Median :4875312 Median : 8.50 Median : 6.0
Mean :5003720 Mean :4827587 Mean :4903693 Mean :4712335 Mean : 8.08 Mean : 5.6
3rd Qu.:5736883 3rd Qu.:5531891 3rd Qu.:5624685 3rd Qu.:5410488 3rd Qu.:10.90 3rd Qu.: 7.8
Max. :6575858 Max. :6326853 Max. :6451012 Max. :6193951 Max. :18.80 Max. :15.8
NA's :654 NA's :685 NA's :652 NA's :710 NA's :3 NA's :3
max_wind_mps cum_energy_delivered_kwh
Min. : 0.00 Min. : 78
1st Qu.: 7.60 1st Qu.:2741038
Median :11.10 Median :5152314
Mean :10.55 Mean :5052061
3rd Qu.:13.90 3rd Qu.:7152592
Max. :31.50 Max. :9999699
NA's :3 NA's :59
Subset Data type
# subset data by type
Cumulative<-subset(data, select=c(1,19))
Possible<-subset(data, select=1:8)
Active<-subset(data, select=c(1,9:15))
Wind<-subset(data, select=c(1,16:18))
# select data to use
dat<-data # keep all the data
# similarly, you could select all the data besides Active Power
# dat<-merge(Possible, Cumulative, Wind) # keep all but Active Power
Preview the raw data
dim(dat)
[1] 52560 19
str(dat)
'data.frame': 52560 obs. of 19 variables:
$ date_time : Factor w/ 52560 levels "01-01-13 00:00",..: 1 2 3 4 5 6 7 8 9 10 ...
$ T1_Possible_Power : int 817 732 764 773 689 735 782 814 730 727 ...
$ T2_Possible_Power : int 805 790 774 769 690 753 818 796 735 773 ...
$ T3_Possible_Power : int 786 763 793 759 711 808 736 789 736 768 ...
$ T4_Possible_Power : int 809 809 821 813 800 830 822 835 805 832 ...
$ T5_Possible_Power : int 755 771 736 627 749 832 713 747 780 797 ...
$ T6_Possible_Power : int 745 758 668 717 749 757 638 696 716 749 ...
$ T7_Possible_Power : int 743 811 656 752 723 797 654 736 683 799 ...
$ T1_Total_Active_Power : int 3109970 3110050 3110130 3110211 3110288 3110366 3110447 3110527 3110604 3110682 ...
$ T2_Total_Active_Power : int 609852 609933 610014 610095 610176 610257 610338 610419 610499 610579 ...
$ T3_Total_Active_Power : int 3254759 3254839 3254920 3255002 3255083 3255164 3255244 3255325 3255406 3255487 ...
$ T4_Total_Active_Power : int 3341303 3341384 3341465 3341546 3341628 3341709 3341790 3341870 3341952 3342033 ...
$ T5_Total_Active_Power : int 3230186 3230266 3230346 3230421 3230501 3230582 3230662 3230742 3230823 3230904 ...
$ T6_Total_Active_Power : int 3264175 3264255 3264336 3264417 3264499 3264579 3264660 3264740 3264821 3264902 ...
$ T7_Total_Active_Power : int 3136754 3136835 3136914 3136994 3137075 3137156 3137234 3137314 3137394 3137474 ...
$ mean_wind_mps : num 11.3 12 11.6 11.8 11.2 11.1 12 11.3 11.9 11.2 ...
$ min_wind_mps : num 8.1 9.1 7.1 9.4 8.1 7.1 9 6.9 8.6 7.9 ...
$ max_wind_mps : num 14.5 15.4 16.7 14.2 14.3 14.4 14.8 14.5 15.5 14.8 ...
$ cum_energy_delivered_kwh: int 2510065 2510615 2511165 2511714 2512265 2512815 2513365 2513915 2514465 2515015 ...
summary(dat)
date_time T1_Possible_Power T2_Possible_Power T3_Possible_Power T4_Possible_Power T5_Possible_Power
01-01-13 00:00: 1 Min. : -3.0 Min. : -3.0 Min. : -2.0 Min. : -3.0 Min. : -3.0
01-01-13 00:10: 1 1st Qu.:191.0 1st Qu.:204.0 1st Qu.:214.0 1st Qu.:222.0 1st Qu.:192.0
01-01-13 00:20: 1 Median :518.0 Median :530.0 Median :552.0 Median :598.0 Median :553.0
01-01-13 00:30: 1 Mean :475.9 Mean :483.5 Mean :496.8 Mean :517.6 Mean :495.3
01-01-13 00:40: 1 3rd Qu.:772.0 3rd Qu.:774.0 3rd Qu.:792.0 3rd Qu.:819.0 3rd Qu.:805.0
01-01-13 00:50: 1 Max. :850.0 Max. :850.0 Max. :850.0 Max. :850.0 Max. :850.0
(Other) :52554 NA's :725 NA's :710 NA's :927 NA's :654 NA's :685
T6_Possible_Power T7_Possible_Power T1_Total_Active_Power T2_Total_Active_Power T3_Total_Active_Power
Min. : -2.0 Min. : -6 Min. :3109970 Min. : 609852 Min. :3254759
1st Qu.:206.0 1st Qu.:180 1st Qu.:3895622 1st Qu.:1391641 1st Qu.:4066341
Median :537.0 Median :497 Median :4744043 Median :2341906 Median :5022455
Mean :489.3 Mean :472 Mean :4608851 Mean :2189450 Mean :4870726
3rd Qu.:786.0 3rd Qu.:785 3rd Qu.:5262894 3rd Qu.:2894952 3rd Qu.:5586471
Max. :850.0 Max. :850 Max. :6045048 Max. :3690817 Max. :6413840
NA's :652 NA's :710 NA's :725 NA's :710 NA's :927
T4_Total_Active_Power T5_Total_Active_Power T6_Total_Active_Power T7_Total_Active_Power mean_wind_mps min_wind_mps
Min. :3341303 Min. :3230186 Min. :3264175 Min. :3136754 Min. : 0.00 Min. : 0.0
1st Qu.:4168935 1st Qu.:4023712 1st Qu.:4085929 1st Qu.:3919523 1st Qu.: 5.60 1st Qu.: 3.6
Median :5159420 Median :4986563 Median :5057922 Median :4875312 Median : 8.50 Median : 6.0
Mean :5003720 Mean :4827587 Mean :4903693 Mean :4712335 Mean : 8.08 Mean : 5.6
3rd Qu.:5736883 3rd Qu.:5531891 3rd Qu.:5624685 3rd Qu.:5410488 3rd Qu.:10.90 3rd Qu.: 7.8
Max. :6575858 Max. :6326853 Max. :6451012 Max. :6193951 Max. :18.80 Max. :15.8
NA's :654 NA's :685 NA's :652 NA's :710 NA's :3 NA's :3
max_wind_mps cum_energy_delivered_kwh
Min. : 0.00 Min. : 78
1st Qu.: 7.60 1st Qu.:2741038
Median :11.10 Median :5152314
Mean :10.55 Mean :5052061
3rd Qu.:13.90 3rd Qu.:7152592
Max. :31.50 Max. :9999699
NA's :3 NA's :59
Checking for the Missing Value:
check<-function(df){
# count NA (missing values)
NAs<-sum(is.na(df))
print(paste("Missing Values:", NAs))
# count incomplete records (rows containing missing values)
ok<-complete.cases(df)
print(paste("Incomplete Records:", sum(! ok)))
# Show incomplete records (if less than 100 NAs).
if(NAs > 0 & NAs <= 100) print( df[which(! complete.cases(df)), ] )
# If more than 100, show column-wise distribution of NAs.
if (NAs > 100) hist(which(is.na(df), arr.ind=TRUE)[,2], xlab="Column", freq=TRUE, breaks=1:dim(df)[2], main="Column-wise distribution of missing values")
}
removed<-function(nrow, nrow1){
print(paste("number of records REMOVED:", nrow-nrow1, sep=" "))
print(paste("number of records REMAINING:", nrow1, sep=" "))
}
Now lets Check our Function:
[1] "Missing Values: 10194"
[1] "Incomplete Records: 1377"
Remove missing values
Here we apply the na.omit function (native) and the removed function we just wrote (user defined)
nrow<-dim(dat)[1] # record the dimensions of the data (before removing anything!)
dat<-na.omit(dat) # omit rows containing missing values
nrow1<-dim(dat)[1] # record the new dimensions of the data (after removing NAs)
removed(nrow, nrow1) # check how many records have been removed
[1] "number of records REMOVED: 1377"
[1] "number of records REMAINING: 51183"
Compute energy sentout and energy benchmarks
# compute energy sentout in each timeblock (10min dt) usingz the cumulative energy counter
n<-length(dat$cum_energy_delivered_kwh)#51183
a<-dat$cum_energy_delivered_kwh[1:n-1]#
b<-dat$cum_energy_delivered_kwh[2:n]
diff<-b-a# This is (2nd Row - 1st Row)
dat$energy_sentout_10min_kwh<-c(diff,0) # cannot compute difference for last timestep, set to zero.
# compute kinetic energy in the wind at each windspeed
# Wind Power = (1/2)*rho*area*(velocity)^3 = [kg/m^3]*[m^2]*[m/s]^3 = [kg*m^2/s^3] = [kg*m^2/s^2][1/s] = [Newton-meter]/[second] = [Joules/second] = [Watts]
rho=1.225 # density of wind/Air (kg/m^3)
area=2174 # sweep area of wind turbines (m^2)
turbines=7 # number of turbines
c<-(1/2)*rho*area
dat$wind_power_kw<-c*(dat$mean_wind_mps)^3*turbines/1000 # kW avg power
dat$wind_energy_10min_kwh<-c*(dat$mean_wind_mps)^3*turbines/(1000*6) # kWh in 10 min
# compute betz limit
betz.coef<- 16/27
dat$betz_limit_10min_kwh<-dat$wind_energy_10min_kwh*betz.coef
# compute turbine efficiency
dat$turbine_eff<-dat$energy_sentout_10min_kwh/dat$wind_energy_10min_kwh
# compute total Possible Power
uncurtailed_power<-apply(X=dat[,2:8], MARGIN=1, FUN=sum)
dat$uncurtailed_10min_kwh<-(uncurtailed_power)/6
# compute curtailment
dat$curtailment_10min_kwh<-dat$uncurtailed-dat$energy_sentout_10min_kwh
# Check for NA
check(dat)
[1] "Missing Values: 179"
[1] "Incomplete Records: 179"
# NaN arise when we compute turbine efficiency with zero in the numerator and denominator (due to windspeed = 0 & energy_sentout = 0). Set these to zero.
nan<-which(dat$turbine_eff == "NaN")
# length(nan) # same as number of NAs
# head(dat[nan, ]) # look at the rows containing "NaN"
dat$turbine_eff[nan]<-0
# Likewise, Inf arise when we compute turbine efficiency with zero in the denominator (due to windspeed = 0). Set these to zero.
inf<-which(dat$turbine_eff == "Inf")
# length(inf) # not counted in NA count, beware!
# head(dat[inf, ]) # look at the rows containing "NaN"
dat$turbine_eff[inf]<-0
# check again
check(dat) # NAs have been removed.
[1] "Missing Values: 0"
[1] "Incomplete Records: 0"
# covert modified date_time character string to POSIX
dat$date_time <- as.POSIXlt(as.character(dat$date_time), format="%m-%d-%y %H:%M")
#dat$date_time= mdy_hm(as.character(dat$date_time))
dat$month <- cut(dat$date_time, breaks = "month")
dat$week <- cut(dat$date_time, breaks = "week")
dat$day <- cut(dat$date_time, breaks = "day")
dat$hour <- cut(dat$date_time, breaks = "hour")
# dummy<-strsplit(as.character(week), split=" ")
# week<-lapply(dummy, '[[', 1) # keep the date, drop the time
# dat$week <- week
#
# dummy<-strsplit(as.character(day), split=" ")
# day<-lapply(dummy, '[[', 1) # keep the date, drop the time
# #dat$day<-as.factor(day)
# dat$day<-day
#
# dummy<-strsplit(as.character(hour), split=" ")
# hour<-lapply(dummy, '[[', 2) # keep the time, drop the date
# dat$hour<-as.factor(hour)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-9999000 235 417 24 585 47120
# looks funny...
# number of records (rows) before filtering
nrow<-dim(dat)[1]
# FILTER 1: remove negative energy values
dat<-subset(dat, dat$energy_sentout_10min_kwh >= 0) #
# number of records removed
nrow2<-dim(dat)[1]
removed(nrow, nrow2)
[1] "number of records REMOVED: 0"
[1] "number of records REMAINING: 51181"
# Filter positive energy values based on physical knowledge of the system.
# Filters enumerated in order of increasing stringency:
# FILTER 2: remove energy values beyond what's possible given installed capacity
capacity<-850*7 # 850 KW rated capacity x 7 turbines
capacity_10min_kwh<-capacity*(10/60) # max energy sentout in 10 minutes
dat<-subset(dat, dat$energy_sentout_10min_kwh <= capacity_10min_kwh)
# # FILTER 3: remove statistical outliers: set quantiles to keep
# range<-quantile(dat$energy_sentout_10min_kwh, probs=c(0.01,0.99))
# dat<-subset(dat, dat$energy_sentout_10min_kwh > range[1])
# dat<-subset(dat, dat$energy_sentout_10min_kwh < range[2])
# # FILTER 4: remove energy values beyond what's possible given windspeed (e.g. kinetic energy contained in wind)
# dat<-subset(dat, dat$energy_sentout_10min_kwh <= dat$wind_energy_10min_kwh)
# # FILTER 5: remove energy values beyond what's possible given windspeed AND Betz limit (kinetic energy *16/27)
# dat<-subset(dat, dat$energy_sentout_10min_kwh <= dat$betz_limit_10min_kwh)
# number of records removed
nrow3<-dim(dat)[1]
removed(nrow2, nrow3)
[1] "number of records REMOVED: 39"
[1] "number of records REMAINING: 51142"
# check the quantiles and histogram again
summary(dat$energy_sentout_10min_kwh)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 235.0 417.0 411.3 585.0 964.0
hist(dat$energy_sentout_10min_kwh)
# check for outliers in wind data
# quantiles
summary(dat$mean_wind_mps)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 5.600 8.500 8.111 10.900 18.800
summary(dat$min_wind_mps)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 3.700 6.000 5.624 7.800 15.800
summary(dat$max_wind_mps)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 7.70 11.10 10.59 13.90 31.50
# historgrams
hist(dat$mean_wind_mps, main="Histogram of Mean Windspeeds")
dat<-subset(dat, mean_wind_mps > 0.5)
wind<-subset(dat, select=c("week", "mean_wind_mps", "max_wind_mps", "min_wind_mps"))
wind<-ddply(wind, .(week), numcolwise(mean))
# plot wind vs time
test<-melt(wind, id.vars=("week"))
ggplot(test, aes(x=week, y=value, group=variable, colour=variable, linetype=variable)) +
geom_line() +
scale_y_continuous(name="Windspeed [m/s]") +
labs(title="Wind Timeseries") +
theme_classic() +
scale_x_discrete(breaks=levels(test$week)[seq(1,360, by=30)], labels=abbreviate)
energy<-subset(dat, select=c("mean_wind_mps", "energy_sentout_10min_kwh", "wind_energy_10min_kwh", "betz_limit_10min_kwh"))
energy<-melt(energy, id.vars=("mean_wind_mps"))
ggplot(energy, aes(x=mean_wind_mps, y=value, group=variable, colour=variable)) +
geom_point() +
scale_y_continuous(name="KWh in 10min", limit=c(0, max(dat$energy_sentout_10min_kwh))) +
scale_x_continuous(name="Windspeed (mps)") +
labs(title="Empirical Power Curve with Betz Limit and Theoretical Wind Energy") +
theme_classic()
dat<-subset(dat, dat$min_wind_mps > 3)
dat<-subset(dat, dat$max_wind_mps < 25)
write.csv(dat, file="clean_VXE_wind_speed.csv")
save(dat, file="clean_VXE_wind_speed.rsav")
filter<-which(dat$mean_wind_mps < 3 | dat$mean_wind_mps > 25)
dat$energy_sentout_10min_kwh[filter]<-0
# fit a distribution to the wind speed data
library(fitdistrplus)
package <U+393C><U+3E31>fitdistrplus<U+393C><U+3E32> was built under R version 3.3.3Loading required package: MASS
package <U+393C><U+3E31>MASS<U+393C><U+3E32> was built under R version 3.3.3
Attaching package: <U+393C><U+3E31>MASS<U+393C><U+3E32>
The following objects are masked from <U+393C><U+3E31>package:fma<U+393C><U+3E32>:
cement, housing, petrol
The following object is masked from <U+393C><U+3E31>package:dplyr<U+393C><U+3E32>:
select
Loading required package: survival
package <U+393C><U+3E31>survival<U+393C><U+3E32> was built under R version 3.3.3
descdist(dat$mean_wind_mps) # heuristic
summary statistics
------
min: 3.4 max: 18.8
median: 9.5
mean: 9.499968
estimated sd: 2.705678
estimated skewness: 0.1801147
estimated kurtosis: 2.493589
# based on heuristic, and knowledge of the system, fit a weibull distribution
weibull.fit<-fitdist(dat$mean_wind_mps, distr="weibull")
summary(weibull.fit)
Fitting of the distribution ' weibull ' by maximum likelihood
Parameters :
estimate Std. Error
shape 3.865554 0.01475029
scale 10.505568 0.01425645
Loglikelihood: -97840.72 AIC: 195685.4 BIC: 195702.7
Correlation matrix:
shape scale
shape 1.0000000 0.3225372
scale 0.3225372 1.0000000
plot(weibull.fit, demp=TRUE)
# summation of energy sentout, divided by the number of hours, yields average power sentout.
avg.power<-sum(dat$energy_sentout_10min_kwh)/(dim(dat)[1]/6)
# summation of energy sentout, divided by the number of hours*capacity, yields capacity factor.
actual.cap.factor<-sum(dat$energy_sentout_10min_kwh)/(850*7*dim(dat)[1]/6)
uncurtailed.cap.factor<-sum(dat$uncurtailed_10min_kwh)/(850*7*dim(dat)[1]/6)
betz.cap.factor<-sum(dat$betz_limit_10min_kwh)/(850*7*dim(dat)[1]/6)
# display the results
data.frame(actual.cap.factor, uncurtailed.cap.factor, betz.cap.factor)
nrow<-dim(dat)[1] # number of complete records BEFORE filtering
# select the numeric or integer class columns
c<-sapply(dat, class) # class of each column
colIndex<-which(c=="numeric" | c=="integer") # column index of numeric or integer vectors
# compute column-wise quantiles
# apply(dat[,-1], 2, stats::quantile)
range<-apply(dat[,colIndex], 2, function(x) {quantile(x, probs=c(.01, .99)) } )
# get names of the columns that we want to filter on (all but the first column)
# names<-colnames(dat[,colIndex])
names<-names(colIndex) # identical to above
# iterate through each column, subsetting the ENTIRE DATA.FRAME by removing records (rows) associated with an outlier value in that column
for(i in 1:dim(range)[2]){
n<-dim(dat)[1] # number of rows at start
dat<-subset(dat, eval(parse(text=names[i])) > range[1,i]) # keep records above 1st percentile
dat<-subset(dat, eval(parse(text=names[i])) < range[2,i]) # keep records less than 99 percentile
p<-dim(dat)[1] # number of rows at end
print(paste(n-p, "records dropped when filtering:", toupper(names[i]), sep=" "))
}
[1] "1422 records dropped when filtering: T1_POSSIBLE_POWER"
[1] "577 records dropped when filtering: T2_POSSIBLE_POWER"
[1] "660 records dropped when filtering: T3_POSSIBLE_POWER"
[1] "1370 records dropped when filtering: T4_POSSIBLE_POWER"
[1] "567 records dropped when filtering: T5_POSSIBLE_POWER"
[1] "210 records dropped when filtering: T6_POSSIBLE_POWER"
[1] "344 records dropped when filtering: T7_POSSIBLE_POWER"
[1] "658 records dropped when filtering: T1_TOTAL_ACTIVE_POWER"
[1] "0 records dropped when filtering: T2_TOTAL_ACTIVE_POWER"
[1] "0 records dropped when filtering: T3_TOTAL_ACTIVE_POWER"
[1] "0 records dropped when filtering: T4_TOTAL_ACTIVE_POWER"
[1] "0 records dropped when filtering: T5_TOTAL_ACTIVE_POWER"
[1] "0 records dropped when filtering: T6_TOTAL_ACTIVE_POWER"
[1] "0 records dropped when filtering: T7_TOTAL_ACTIVE_POWER"
[1] "152 records dropped when filtering: MEAN_WIND_MPS"
[1] "348 records dropped when filtering: MIN_WIND_MPS"
[1] "101 records dropped when filtering: MAX_WIND_MPS"
[1] "732 records dropped when filtering: CUM_ENERGY_DELIVERED_KWH"
[1] "186 records dropped when filtering: ENERGY_SENTOUT_10MIN_KWH"
[1] "0 records dropped when filtering: WIND_POWER_KW"
[1] "0 records dropped when filtering: WIND_ENERGY_10MIN_KWH"
[1] "0 records dropped when filtering: BETZ_LIMIT_10MIN_KWH"
[1] "372 records dropped when filtering: TURBINE_EFF"
[1] "0 records dropped when filtering: UNCURTAILED_10MIN_KWH"
[1] "471 records dropped when filtering: CURTAILMENT_10MIN_KWH"
# number of records removed
nrow2<-dim(dat)[1]
removed(nrow, nrow2)
[1] "number of records REMOVED: 8170"
[1] "number of records REMAINING: 32378"
# based on heuristic, and knowledge of the system, fit a weibull distribution
weibull.fit.2<-fitdist(dat$mean_wind_mps, distr="weibull")
summary(weibull.fit.2)
Fitting of the distribution ' weibull ' by maximum likelihood
Parameters :
estimate Std. Error
shape 4.676697 0.02018640
scale 9.959792 0.01249698
Loglikelihood: -71206.92 AIC: 142417.8 BIC: 142434.6
Correlation matrix:
shape scale
shape 1.0000000 0.3210392
scale 0.3210392 1.0000000
plot(dat$mean_wind_mps, dat$uncurtailed_10min_kwh, col="red", cex=0.2, pch=20, main="Curtailed and Uncurtailed Wind Energy Production\nSao Vicente, Cape Verde (2013)", xlab="Windspeed (mps)", ylab="Energy (kWh/10min)")
points(dat$mean_wind_mps, dat$energy_sentout_10min_kwh, col="green", cex=0.2, pch=20)
legend("topleft", legend=c("Energy Possible (Uncurtailed)", "Energy Sentout (Curtailed)"), col=c("red", "green"), pch=20)
# load the manufacturers power curve
MPC<-read.table(file="v52-850KW-power-curve.csv", header=TRUE, strip.white=TRUE, sep=",")
MPC$design_sentout_10min_kwh<-MPC$power_kW*7/6 # kW x (7 turbines) x (1/6 hour)
MPC<-subset(MPC, windspeed_mps<25)
# plot the manufacturers power curve
plot(x=MPC$windspeed_mps, y=MPC$design_sentout_10min_kwh, type="p", pch=4, main="Manufacturers Power Curve\n with Empirical Overlay", xlab="Windspeed (mps)", ylab="Energy (kWh/10min)", xlim=range(dat$mean_wind_mps))
points(y=dat$energy_sentout_10min_kwh, x=dat$mean_wind_mps, col="green", cex=0.1, pch=20)
points(y=dat$uncurtailed_10min_kwh, x=dat$mean_wind_mps, col="red", cex=0.1, pch=20)
legend("bottomright", col=c("black", "green", "red"), pch=c(4, 20, 20), legend=c("Manufacturers Power Curve", "Energy Sentout (Curtailed)","Energy Possible (Uncurtailed)") )
# For each observation, supply the energy sentout (kWh), find the closest matching value on the power curve (kW), query the corresponding wind speed from the power curve (mps). This becomes the corrected windspeed. Apply this whenever the measured windspeed is less than the windspeed dictated by the power curve. This shifts all points that were left of the power curve, onto the power curve.
# matching vector must be in ascending order
MPC<-MPC[ order(MPC$power_kW, decreasing=FALSE), ]
# pass energy sentout (kWh), return row index of closest matching value on power curve (kW)
power.match<-findInterval(x=dat$energy_sentout_10min_kwh, vec=MPC$power_kW, all.inside=TRUE, rightmost.closed=TRUE)
# read-off corresponding wind speed from the power curve (mps)
corrected.windspeed<-MPC[power.match, c("windspeed_mps")]
# # visually inspect matched values with values matched against --> Looks good!
# plot(x=MPC[power.match, c("windspeed_mps")], y=MPC[power.match, c("power_kW")], col="red", cex=3, xlab="windspeed_mps", ylab="power_kW")
# points(x=MPC$windspeed_mps, y=MPC$power_kW, col="black", cex=0.1)
# create a new column called "corrected_wind_mps"
dat$corrected_wind_mps<-corrected.windspeed
# if the observed mean windspeed is greater than the windspeed dictated by the power curve, that's OK --> keep the recorded mean windspeed.
keep<-which(dat$mean_wind_mps >= dat$corrected_wind_mps)
dat$corrected_wind_mps[keep]<-dat$mean_wind_mps[keep]
# We do this because it is possible that less power was produced at a given windspeed than dictated by the power curve, due to curtailment.
# however it is *not* possible to produce more power at a given windspeed than dictated by the power curve. Hence we correct those values, only.
fitw<-fitdist(dat$mean_wind_mps, "weibull")
summary(fitw)
Fitting of the distribution ' weibull ' by maximum likelihood
Parameters :
estimate Std. Error
shape 4.676697 0.02018640
scale 9.959792 0.01249698
Loglikelihood: -71206.92 AIC: 142417.8 BIC: 142434.6
Correlation matrix:
shape scale
shape 1.0000000 0.3210392
scale 0.3210392 1.0000000
denscomp(fitw, addlegend=FALSE, main="Histogram and Weibull Fit for Uncorrected Windspeeds")
fitw<-fitdist(dat$corrected_wind_mps, "weibull")
summary(fitw)
Fitting of the distribution ' weibull ' by maximum likelihood
Parameters :
estimate Std. Error
shape 3.977839 0.01378950
scale 10.427128 0.01542095
Loglikelihood: -74324.82 AIC: 148653.6 BIC: 148670.4
Correlation matrix:
shape scale
shape 1.0000000 0.3278074
scale 0.3278074 1.0000000
denscomp(fitw, addlegend=FALSE, main="Histogram and Weibull Fit for Corrected Windspeeds")
plot(x=MPC$windspeed_mps, y=MPC$design_sentout_10min_kwh, type="p", pch=4, main="Manufacturers Power Curve\n with Corrected Empirical Overlay", xlab="Windspeed (mps)", ylab="Energy (kWh/10min)", xlim=c(0,20))
points(y=dat$energy_sentout_10min_kwh, x=dat$corrected_wind_mps, col="green", cex=0.1, pch=20)
points(y=dat$uncurtailed_10min_kwh, x=dat$corrected_wind_mps, col="red", cex=0.1, pch=20)
legend("bottomright", col=c("black", "green", "red"), pch=c(4, 20, 20), legend=c("Manufacturers Power Curve", "Curtailed Power Curve (with windspeed correction)","Uncurtailed Power Curve (with windspeed correction)") )