# Load the dataset
data <- read.csv("C:/Users/bedra/OneDrive/Masaüstü/rüzgar/T1.csv")
# Convert date column to datetime format
data$DateTime <- as.POSIXct(data$Date.Time, format="%d %m %Y %H:%M")
data$DateHour <- format(data$DateTime, "%Y-%m-%d %H")
# Aggregate data by hour
hourly <- data %>%
group_by(DateHour) %>%
summarise(
Power = mean(LV.ActivePower..kW., na.rm = TRUE),
WindSpeed = mean(Wind.Speed..m.s., na.rm = TRUE),
TheoreticalPower = sum(Theoretical_Power_Curve..KWh., na.rm = TRUE),
WindDir = atan2(mean(sin(Wind.Direction.... * pi / 180), na.rm = TRUE), mean(cos(Wind.Direction.... * pi / 180), na.rm = TRUE)) * 180 / pi) %>%
ungroup()
# Extract hour of day as a numeric feature
hourly$Hour <- as.integer(substr(hourly$DateHour, 12, 13))
# Create a lagged power feature (previous hour's power)
hourly$Power_lag <- c(NA, head(hourly$Power, -1))
# Remove rows with missing values
hourly <- na.omit(hourly)
# Split data into 80% train and 20% test
n <- nrow(hourly)
train <- hourly[1:round(0.8 * n), ]
test <- hourly[(round(0.8 * n) + 1):n, ]
# Select features for visualization
box_data <- train[, c("Power_lag", "WindSpeed", "WindDir", "TheoreticalPower", "Hour")]
# Reshape to long format for ggplot
box_data_long <- pivot_longer(box_data, cols = everything(), names_to = "Feature", values_to = "Value")
# Plot boxplots for each feature to inspect distributions and outliers
ggplot(box_data_long, aes(x = Feature, y = Value, fill = Feature)) +geom_boxplot(outlier.color = "red", outlier.alpha = 0.5) + facet_wrap(~ Feature, scales = "free") + theme_bw() + theme(legend.position = "none") + labs(title = "Feature Distributions & Outlier Analysis", x = "", y = "Values")

# Train a linear regression model
model <- lm(Power ~ Power_lag + WindSpeed + WindDir + TheoreticalPower + Hour, data = train)
summary(model)
##
## Call:
## lm(formula = Power ~ Power_lag + WindSpeed + WindDir + TheoreticalPower +
## Hour, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3052.00 -53.36 50.94 106.05 1421.02
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.126e+02 1.172e+01 -9.611 < 2e-16 ***
## Power_lag 4.611e-01 6.671e-03 69.108 < 2e-16 ***
## WindSpeed 1.292e+01 2.745e+00 4.706 2.57e-06 ***
## WindDir -1.359e-01 3.784e-02 -3.591 0.000331 ***
## TheoreticalPower 7.849e-02 1.586e-03 49.492 < 2e-16 ***
## Hour 1.434e+00 5.065e-01 2.830 0.004664 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 288 on 6744 degrees of freedom
## Multiple R-squared: 0.9494, Adjusted R-squared: 0.9494
## F-statistic: 2.531e+04 on 5 and 6744 DF, p-value: < 2.2e-16
# Generate predictions on the test set
predictions <- predict(model, newdata = test)
actual <- test$Power
# Calculate error metrics on test set
mae <- mean(abs(actual - predictions))
mask <- actual > 10 # exclude near-zero values for MAPE
mape <- mean(abs((actual[mask] - predictions[mask]) / actual[mask])) * 100
rmse <- sqrt(mean((actual - predictions)^2))
ss_res <- sum((actual - predictions)^2)
ss_tot <- sum((actual - mean(actual))^2)
r2 <- (1 - (ss_res / ss_tot)) * 100
cat("R²:", round(r2, 2), "%\n")
## R²: 95.38 %
cat("ā The model explains", round(r2, 2), "% of the variance. Strong predictive performance.\n")
## ā The model explains 95.38 % of the variance. Strong predictive performance.
cat("MAE:", round(mae, 2), "kW\n")
## MAE: 181.3 kW
cat("ā On average, predictions deviate by", round(mae, 2), "kW from actual values.\n")
## ā On average, predictions deviate by 181.3 kW from actual values.
cat("MAPE:", round(mape, 2), "%\n")
## MAPE: 42.8 %
cat("ā Percentage error is", round(mape, 2), "%; elevated due to low-power operating periods.\n")
## ā Percentage error is 42.8 %; elevated due to low-power operating periods.
cat("RMSE:", round(rmse, 2), "kW\n")
## RMSE: 284.73 kW
cat("ā Root mean square error is", round(rmse, 2), "kW, reflecting sensitivity to large errors.\n")
## ā Root mean square error is 284.73 kW, reflecting sensitivity to large errors.
# Calculate the same metrics on the training set for comparison
train_pred <- predict(model, newdata = train)
train_actual <- train$Power
train_r2 <- summary(model)$r.squared * 100
train_mae <- mean(abs(train_actual - train_pred))
train_mask <- train_actual > 10
train_mape <- mean(abs((train_actual[train_mask] - train_pred[train_mask]) / train_actual[train_mask])) * 100
train_rmse <- sqrt(mean((train_actual - train_pred)^2))
# Print train vs test metrics side by side
metrics_table <- data.frame(Metric = c("R² (%)", "MAE (kW)", "MAPE (%)", "RMSE (kW)"),
Train = c(round(train_r2, 2), round(train_mae, 2), round(train_mape, 2), round(train_rmse, 2)),
Test = c(round(r2, 2), round(mae, 2), round(mape, 2), round(rmse, 2)))
print(metrics_table)
## Metric Train Test
## 1 R² (%) 94.94 95.38
## 2 MAE (kW) 167.36 181.30
## 3 MAPE (%) 40.41 42.80
## 4 RMSE (kW) 287.89 284.73
# Prepare data frame for plotting
plot_data <- data.frame( Index = 1:length(actual), Actual = actual, Predicted = predictions)
# Actual vs Predicted on the same chart
ggplot(plot_data, aes(x = Index)) + geom_line(aes(y = Actual, color = "Actual")) + geom_line(aes(y = Predicted, color = "Predicted")) + labs(title = "Actual vs Predicted Wind Power", y = "Power (kW)", x = "Time Index")

# Actual power only
ggplot(plot_data, aes(x = Index, y = Actual)) + geom_line(color = "blue") + labs(title = "Actual Wind Power", y = "Power (kW)", x = "Time Index")

# Predicted power only
ggplot(plot_data, aes(x = Index, y = Predicted)) + geom_line(color = "red") + labs(title = "Predicted Wind Power", y = "Power (kW)", x = "Time Index")

# Scatter plot: how well predictions align with actuals (ideal = diagonal line)
ggplot(plot_data, aes(x = Actual, y = Predicted)) + geom_point(alpha = 0.3, color = "steelblue") + geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") + labs(title = "Scatter Plot: Actual vs Predicted", x = "Actual Power (kW)", y = "Predicted Power (kW)")

# Residuals over time (should be random around zero)
plot_data$Residuals <- actual - predictions
ggplot(plot_data, aes(x = Index, y = Residuals)) + geom_point(alpha = 0.3, color = "darkgreen") + geom_hline(yintercept = 0, color = "red", linetype = "dashed") + labs(title = "Residuals over Time", x = "Time Index", y = "Residual (kW)")

# Residual distribution (should be roughly bell-shaped and centered at zero)
ggplot(plot_data, aes(x = Residuals)) + geom_histogram(binwidth = 50, fill = "steelblue", color = "white") + geom_vline(xintercept = 0, color = "red", linetype = "dashed") + labs(title = "Residual Distribution", x = "Residual (kW)", y = "Frequency")

# Bar chart of model coefficients (shows each feature's influence)
coef_data <- data.frame(Feature = names(coef(model))[-1], Coefficient = coef(model)[-1])
ggplot(coef_data, aes(x = reorder(Feature, Coefficient), y = Coefficient)) + geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + labs(title = "Feature Coefficients", x = "Feature", y = "Coefficient Value")
