data <- read.csv("C:/Users/bedra/OneDrive/Masaüstü/rüzgar/T1.csv")
data$DateTime <- as.POSIXct(data$Date.Time, format="%d %m %Y %H:%M")
data$DateHour <- format(data$DateTime, "%Y-%m-%d %H")
hourly <- data %>%
group_by(DateHour) %>%
summarise(
Power = mean(LV.ActivePower..kW., na.rm = TRUE),
WindSpeed = mean(Wind.Speed..m.s., na.rm = TRUE),
TheoreticalPower = sum(Theoretical_Power_Curve..KWh., na.rm = TRUE),
WindDir = atan2(mean(sin(Wind.Direction.... * pi / 180), na.rm = TRUE), mean(cos(Wind.Direction.... * pi / 180), na.rm = TRUE)) * 180 / pi) %>%
ungroup()
hourly$Hour <- as.integer(substr(hourly$DateHour, 12, 13))
hourly$Power_lag <- c(NA, head(hourly$Power, -1))
hourly <- na.omit(hourly)
n <- nrow(hourly)
train <- hourly[1:round(0.8 * n), ]
test <- hourly[(round(0.8 * n) + 1):n, ]
model <- lm(Power ~ Power_lag + WindSpeed + WindDir + TheoreticalPower + Hour, data = train)
summary(model)
##
## Call:
## lm(formula = Power ~ Power_lag + WindSpeed + WindDir + TheoreticalPower +
## Hour, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3052.00 -53.36 50.94 106.05 1421.02
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.126e+02 1.172e+01 -9.611 < 2e-16 ***
## Power_lag 4.611e-01 6.671e-03 69.108 < 2e-16 ***
## WindSpeed 1.292e+01 2.745e+00 4.706 2.57e-06 ***
## WindDir -1.359e-01 3.784e-02 -3.591 0.000331 ***
## TheoreticalPower 7.849e-02 1.586e-03 49.492 < 2e-16 ***
## Hour 1.434e+00 5.065e-01 2.830 0.004664 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 288 on 6744 degrees of freedom
## Multiple R-squared: 0.9494, Adjusted R-squared: 0.9494
## F-statistic: 2.531e+04 on 5 and 6744 DF, p-value: < 2.2e-16
predictions <- predict(model, newdata = test)
actual <- test$Power
mae <- mean(abs(actual - predictions))
mask <- actual > 10
mape <- mean(abs((actual[mask] - predictions[mask]) / actual[mask])) * 100
rmse <- sqrt(mean((actual - predictions)^2))
ss_res <- sum((actual - predictions)^2)
ss_tot <- sum((actual - mean(actual))^2)
r2 <- (1 - (ss_res / ss_tot)) * 100
cat("R²:", round(r2, 2), "%\n")
## R²: 95.38 %
cat("→ The model explains", round(r2, 2), "% of the variance. Strong predictive performance.\n")
## → The model explains 95.38 % of the variance. Strong predictive performance.
cat("MAE:", round(mae, 2), "kW\n")
## MAE: 181.3 kW
cat("→ On average, predictions deviate by", round(mae, 2), "kW from actual values.\n")
## → On average, predictions deviate by 181.3 kW from actual values.
cat("MAPE:", round(mape, 2), "%\n")
## MAPE: 42.8 %
cat("→ Percentage error is", round(mape, 2), "%; elevated due to low-power operating periods.\n")
## → Percentage error is 42.8 %; elevated due to low-power operating periods.
cat("RMSE:", round(rmse, 2), "kW\n")
## RMSE: 284.73 kW
cat("→ Root mean square error is", round(rmse, 2), "kW, reflecting sensitivity to large errors.\n")
## → Root mean square error is 284.73 kW, reflecting sensitivity to large errors.
train_pred <- predict(model, newdata = train)
train_actual <- train$Power
train_r2 <- summary(model)$r.squared * 100
train_mae <- mean(abs(train_actual - train_pred))
train_mask <- train_actual > 10
train_mape <- mean(abs((train_actual[train_mask] - train_pred[train_mask]) / train_actual[train_mask])) * 100
train_rmse <- sqrt(mean((train_actual - train_pred)^2))
metrics_table <- data.frame(Metric = c("R² (%)", "MAE (kW)", "MAPE (%)", "RMSE (kW)"),
Train = c(round(train_r2, 2), round(train_mae, 2), round(train_mape, 2), round(train_rmse, 2)),
Test = c(round(r2, 2), round(mae, 2), round(mape, 2), round(rmse, 2)))
print(metrics_table)
## Metric Train Test
## 1 R² (%) 94.94 95.38
## 2 MAE (kW) 167.36 181.30
## 3 MAPE (%) 40.41 42.80
## 4 RMSE (kW) 287.89 284.73
plot_data <- data.frame( Index = 1:length(actual), Actual = actual, Predicted = predictions)
ggplot(plot_data, aes(x = Index)) + geom_line(aes(y = Actual, color = "Actual")) + geom_line(aes(y = Predicted, color = "Predicted")) + labs(title = "Actual vs Predicted Wind Power", y = "Power (kW)", x = "Time Index")

ggplot(plot_data, aes(x = Index, y = Actual)) + geom_line(color = "blue") + labs(title = "Actual Wind Power", y = "Power (kW)", x = "Time Index")

ggplot(plot_data, aes(x = Index, y = Predicted)) + geom_line(color = "red") + labs(title = "Predicted Wind Power", y = "Power (kW)", x = "Time Index")

ggplot(plot_data, aes(x = Actual, y = Predicted)) + geom_point(alpha = 0.3, color = "steelblue") + geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") + labs(title = "Scatter Plot: Actual vs Predicted", x = "Actual Power (kW)", y = "Predicted Power (kW)")

plot_data$Residuals <- actual - predictions
ggplot(plot_data, aes(x = Index, y = Residuals)) + geom_point(alpha = 0.3, color = "darkgreen") + geom_hline(yintercept = 0, color = "red", linetype = "dashed") + labs(title = "Residuals over Time", x = "Time Index", y = "Residual (kW)")

ggplot(plot_data, aes(x = Residuals)) + geom_histogram(binwidth = 50, fill = "steelblue", color = "white") + geom_vline(xintercept = 0, color = "red", linetype = "dashed") + labs(title = "Residual Distribution", x = "Residual (kW)", y = "Frequency")

coef_data <- data.frame(Feature = names(coef(model))[-1], Coefficient = coef(model)[-1])
ggplot(coef_data, aes(x = reorder(Feature, Coefficient), y = Coefficient)) + geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + labs(title = "Feature Coefficients", x = "Feature", y = "Coefficient Value")
