# Setup: load required libraries
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
# Load the dataset
data <- read.csv("C:/Users/bedra/OneDrive/Masaüstü/rüzgar/T1.csv")
# Convert date column to datetime format
data$DateTime <- as.POSIXct(data$Date.Time, format="%d %m %Y %H:%M")
data$DateHour <- format(data$DateTime, "%Y-%m-%d %H")
# Aggregate data by hour
hourly <- data %>%
  group_by(DateHour) %>%
  summarise(
    Power            = mean(LV.ActivePower..kW., na.rm = TRUE),
    WindSpeed        = mean(Wind.Speed..m.s., na.rm = TRUE),
    TheoreticalPower = sum(Theoretical_Power_Curve..KWh., na.rm = TRUE),
    WindDir          = atan2(mean(sin(Wind.Direction.... * pi / 180), na.rm = TRUE), mean(cos(Wind.Direction.... * pi / 180), na.rm = TRUE)) * 180 / pi) %>%
  ungroup()
# Extract hour of day as a numeric feature
hourly$Hour <- as.integer(substr(hourly$DateHour, 12, 13))
# Create a lagged power feature (previous hour's power)
hourly$Powerlag <- c(NA, head(hourly$Power, -1))
# Remove rows with missing values
hourly <- na.omit(hourly)
# Split data into 80% train and 20% test
n <- nrow(hourly)
train <- hourly[1:round(0.8 * n), ]
test <- hourly[(round(0.8 * n) + 1):n, ]
# Correlation heatmap to check feature relationships
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
corr_data <- train[, c("Power", "Powerlag", "WindSpeed", "WindDir", "TheoreticalPower", "Hour")]
corr_matrix <- cor(corr_data)
corrplot(corr_matrix, method = "color", type = "upper", addCoef.col = "black", tl.col = "black", number.cex = 0.7)

# Select features for visualization
box_data <- train[, c("Powerlag", "WindSpeed", "WindDir", "TheoreticalPower", "Hour")]
# Reshape to long format for ggplot
box_data_long <- pivot_longer(box_data, cols = everything(), names_to = "Feature", values_to = "Value")
# Plot boxplots for each feature to inspect distributions and outliers
ggplot(box_data_long, aes(x = Feature, y = Value, fill = Feature)) +geom_boxplot(outlier.color = "red", outlier.alpha = 0.5) + facet_wrap(~ Feature, scales = "free") + theme_bw() + theme(legend.position = "none") + labs(title = "Feature Distributions & Outlier Analysis", x = "", y = "Values")

# Train a linear regression model
model <- lm(Power ~ Powerlag + WindSpeed + WindDir + TheoreticalPower + Hour, data = train)
summary(model)
## 
## Call:
## lm(formula = Power ~ Powerlag + WindSpeed + WindDir + TheoreticalPower + 
##     Hour, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3052.00   -53.36    50.94   106.05  1421.02 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.126e+02  1.172e+01  -9.611  < 2e-16 ***
## Powerlag          4.611e-01  6.671e-03  69.108  < 2e-16 ***
## WindSpeed         1.292e+01  2.745e+00   4.706 2.57e-06 ***
## WindDir          -1.359e-01  3.784e-02  -3.591 0.000331 ***
## TheoreticalPower  7.849e-02  1.586e-03  49.492  < 2e-16 ***
## Hour              1.434e+00  5.065e-01   2.830 0.004664 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 288 on 6744 degrees of freedom
## Multiple R-squared:  0.9494, Adjusted R-squared:  0.9494 
## F-statistic: 2.531e+04 on 5 and 6744 DF,  p-value: < 2.2e-16
# Generate predictions on the test set
predictions <- predict(model, newdata = test)
actual <- test$Power
# Calculate error metrics on test set
mae <- mean(abs(actual - predictions))
mask <- actual > 10 # exclude near-zero values for MAPE
mape <- mean(abs((actual[mask] - predictions[mask]) / actual[mask])) * 100
rmse <- sqrt(mean((actual - predictions)^2))
ss_res <- sum((actual - predictions)^2)
ss_tot <- sum((actual - mean(actual))^2)
r2 <- (1 - (ss_res / ss_tot)) * 100

cat("R²:", round(r2, 2), "%\n")
## R²: 95.38 %
cat("→ The model explains", round(r2, 2), "% of the variance. Strong predictive performance.\n")
## → The model explains 95.38 % of the variance. Strong predictive performance.
cat("MAE:", round(mae, 2), "kW\n")
## MAE: 181.3 kW
cat("→ On average, predictions deviate by", round(mae, 2), "kW from actual values.\n")
## → On average, predictions deviate by 181.3 kW from actual values.
cat("MAPE:", round(mape, 2), "%\n")
## MAPE: 42.8 %
cat("→ Percentage error is", round(mape, 2), "%; elevated due to low-power operating periods.\n")
## → Percentage error is 42.8 %; elevated due to low-power operating periods.
cat("RMSE:", round(rmse, 2), "kW\n")
## RMSE: 284.73 kW
cat("→ Root mean square error is", round(rmse, 2), "kW, reflecting sensitivity to large errors.\n")
## → Root mean square error is 284.73 kW, reflecting sensitivity to large errors.
# Calculate the same metrics on the training set for comparison
train_pred <- predict(model, newdata = train)
train_actual <- train$Power

train_r2   <- summary(model)$r.squared * 100
train_mae  <- mean(abs(train_actual - train_pred))
train_mask <- train_actual > 10  
train_mape <- mean(abs((train_actual[train_mask] - train_pred[train_mask]) / train_actual[train_mask])) * 100
train_rmse <- sqrt(mean((train_actual - train_pred)^2)) 

# Print train vs test metrics side by side
metrics_table <- data.frame(Metric = c("R² (%)", "MAE (kW)", "MAPE (%)", "RMSE (kW)"),
Train  = c(round(train_r2, 2), round(train_mae, 2), round(train_mape, 2), round(train_rmse, 2)),
Test   = c(round(r2, 2), round(mae, 2), round(mape, 2), round(rmse, 2)))
print(metrics_table)
##      Metric  Train   Test
## 1    R² (%)  94.94  95.38
## 2  MAE (kW) 167.36 181.30
## 3  MAPE (%)  40.41  42.80
## 4 RMSE (kW) 287.89 284.73
# Prepare data frame for plotting
plot_data <- data.frame( Index = 1:length(actual), Actual = actual, Predicted = predictions)

# Actual vs Predicted on the same chart
ggplot(plot_data, aes(x = Index)) + geom_line(aes(y = Actual, color = "Actual")) + geom_line(aes(y = Predicted, color = "Predicted")) + labs(title = "Actual vs Predicted Wind Power", y = "Power (kW)", x = "Time Index")

# Actual power only
ggplot(plot_data, aes(x = Index, y = Actual)) + geom_line(color = "blue") + labs(title = "Actual Wind Power", y = "Power (kW)", x = "Time Index")

# Predicted power only
ggplot(plot_data, aes(x = Index, y = Predicted)) + geom_line(color = "red") + labs(title = "Predicted Wind Power", y = "Power (kW)", x = "Time Index")

# Scatter plot: how well predictions align with actuals (ideal = diagonal line)
ggplot(plot_data, aes(x = Actual, y = Predicted)) + geom_point(alpha = 0.3, color = "steelblue") + geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") + labs(title = "Scatter Plot: Actual vs Predicted", x = "Actual Power (kW)", y = "Predicted Power (kW)")

# Residuals over time (should be random around zero)
plot_data$Residuals <- actual - predictions
ggplot(plot_data, aes(x = Index, y = Residuals)) + geom_point(alpha = 0.3, color = "darkgreen") + geom_hline(yintercept = 0, color = "red", linetype = "dashed") + labs(title = "Residuals over Time", x = "Time Index", y = "Residual (kW)")

# Residual distribution (should be roughly bell-shaped and centered at zero)
ggplot(plot_data, aes(x = Residuals)) + geom_histogram(binwidth = 50, fill = "steelblue", color = "white") + geom_vline(xintercept = 0, color = "red", linetype = "dashed") + labs(title = "Residual Distribution", x = "Residual (kW)", y = "Frequency")

# Bar chart of model coefficients (shows each feature's influence)
coef_data <- data.frame(Feature = names(coef(model))[-1], Coefficient = coef(model)[-1])

ggplot(coef_data, aes(x = reorder(Feature, Coefficient), y = Coefficient)) + geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + labs(title = "Feature Coefficients", x = "Feature", y = "Coefficient Value")