##Load all relevant packages and import the data

library(readxl)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Metrics)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
## 
##     precision, recall
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(readxl)
# Read the assessment workbook from the working directory into a tibble.
energy_data <- read_excel(path = "Dataset for the Assessment.xlsx")
# Show the column types to verify that the data were imported successfully.
str(object = energy_data)
## tibble [200 × 11] (S3: tbl_df/tbl/data.frame)
##  $ Region                     : chr [1:200] "Wales" "England" "Scotland" "Wales" ...
##  $ Renewable_Capacity_MW      : num [1:200] 64.1 336.4 191.5 278.9 458.4 ...
##  $ Electricity_Demand_MWh     : num [1:200] 1465 5782 5866 6737 7535 ...
##  $ Solar_Radiation_kWh_m2     : num [1:200] 6.49 5.68 4.55 7.07 6.42 ...
##  $ Wind_Speed_mps             : num [1:200] 9.39 3.62 7.04 4.61 3.76 ...
##  $ Population_Density         : num [1:200] 143.5 61.9 131.6 312.4 239.6 ...
##  $ GDP_per_Capita             : num [1:200] 14049 77837 71865 74943 79644 ...
##  $ Government_Subsidy_Millions: num [1:200] 1.6 18.79 1.99 11.28 14.47 ...
##  $ CO2_Emissions_MT           : num [1:200] 2.644 2.448 0.226 1.772 1.963 ...
##  $ Renewable_Percentage       : num [1:200] 97.3 54.6 44.9 60.5 55.2 ...
##  $ Adoption                   : num [1:200] 1 0 1 0 1 0 1 0 0 0 ...
# Preview the first six rows of the imported data.
head(energy_data, n = 6L)
## # A tibble: 6 × 11
##   Region   Renewable_Capacity_MW Electricity_Demand_MWh Solar_Radiation_kWh_m2
##   <chr>                    <dbl>                  <dbl>                  <dbl>
## 1 Wales                     64.1                  1465.                   6.49
## 2 England                  336.                   5782.                   5.68
## 3 Scotland                 191.                   5866.                   4.55
## 4 Wales                    279.                   6737.                   7.07
## 5 Wales                    458.                   7535.                   6.42
## 6 England                  162.                   9783.                   3.81
## # ℹ 7 more variables: Wind_Speed_mps <dbl>, Population_Density <dbl>,
## #   GDP_per_Capita <dbl>, Government_Subsidy_Millions <dbl>,
## #   CO2_Emissions_MT <dbl>, Renewable_Percentage <dbl>, Adoption <dbl>

### Count the missing values in every column

print("Missing Values per Column:")
## [1] "Missing Values per Column:"
# Flag each cell as missing or not, then total the flags column by column.
energy_data %>%
  is.na() %>%
  colSums()
##                      Region       Renewable_Capacity_MW 
##                           0                           0 
##      Electricity_Demand_MWh      Solar_Radiation_kWh_m2 
##                           0                           0 
##              Wind_Speed_mps          Population_Density 
##                           0                           0 
##              GDP_per_Capita Government_Subsidy_Millions 
##                           0                           0 
##            CO2_Emissions_MT        Renewable_Percentage 
##                           0                           0 
##                    Adoption 
##                           0

### Find outliers

# Horizontal box plot of electricity demand; points drawn beyond the
# whiskers are the candidate outliers.
boxplot(
  x = energy_data[["Electricity_Demand_MWh"]],
  main = "Outlier Check: Electricity Demand",
  col = "tomato",
  horizontal = TRUE
)

### Check the number of 'successes' (Adoption == 1) for logistic regression

# Tabulate the binary outcome so the class balance is visible before the
# logistic model is fitted. The subsidy/renewable correlation (cor_policy)
# is not needed here; it is computed in the correlation section below.
adoption_counts <- table(Adoption = energy_data$Adoption)
print(adoption_counts)

##Test correlation between Government Subsidy vs Renewable percentage

# Pearson correlation between subsidy level and renewable share; the
# rounded value is shown in the plot subtitle.
cor_policy <- with(
  energy_data,
  cor(Government_Subsidy_Millions, Renewable_Percentage)
)

# Scatter plot with a fitted least-squares line and its confidence band.
ggplot(
  data = energy_data,
  mapping = aes(x = Government_Subsidy_Millions, y = Renewable_Percentage)
) +
  geom_point(colour = "darkgreen", alpha = 0.4) +
  geom_smooth(method = "lm", colour = "red", se = TRUE) +
  ggtitle(
    label = "Government Subsidies vs. Renewable Output",
    subtitle = paste(
      "Correlation Coefficient (r) =",
      round(cor_policy, digits = 4)
    )
  ) +
  xlab("Government Subsidy (Millions)") +
  ylab("Renewable Percentage (%)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

## Test correlation between wind speed and renewable percentage

# Pearson correlation between wind speed and renewable share; the rounded
# value is shown in the plot subtitle.
cor_wind <- with(
  energy_data,
  cor(Wind_Speed_mps, Renewable_Percentage)
)

# Scatter plot with a fitted least-squares line and its confidence band.
ggplot(
  data = energy_data,
  mapping = aes(x = Wind_Speed_mps, y = Renewable_Percentage)
) +
  geom_point(colour = "blue", alpha = 0.4) +
  geom_smooth(method = "lm", colour = "red", se = TRUE) +
  ggtitle(
    label = "Relationship: Wind Speed vs. Renewable Percentage",
    subtitle = paste(
      "Correlation Coefficient (r) =",
      round(cor_wind, digits = 4)
    )
  ) +
  xlab("Wind Speed (mps)") +
  ylab("Renewable Percentage (%)") +
  labs(caption = "Source: Renewable Energy Dataset") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

##Test Linearity

# Multiple linear regression of renewable share on the five candidate
# drivers; this fit is the one used by the assumption checks that follow.
model_linear <- lm(
  formula = Renewable_Percentage ~
    Government_Subsidy_Millions +
    Wind_Speed_mps +
    Solar_Radiation_kWh_m2 +
    GDP_per_Capita +
    Population_Density,
  data = energy_data
)
# Coefficient table, residual summary and overall fit statistics.
summary(model_linear)
## 
## Call:
## lm(formula = Renewable_Percentage ~ Government_Subsidy_Millions + 
##     Wind_Speed_mps + Solar_Radiation_kWh_m2 + GDP_per_Capita + 
##     Population_Density, data = energy_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.844 -21.201   1.302  18.868  43.753 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  5.544e+01  9.716e+00   5.706 4.27e-08 ***
## Government_Subsidy_Millions  6.643e-02  3.000e-01   0.221   0.8250    
## Wind_Speed_mps              -3.266e-01  4.844e-01  -0.674   0.5010    
## Solar_Radiation_kWh_m2       6.683e-01  1.179e+00   0.567   0.5714    
## GDP_per_Capita              -6.644e-05  8.023e-05  -0.828   0.4086    
## Population_Density           2.195e-02  1.266e-02   1.734   0.0844 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.4 on 194 degrees of freedom
## Multiple R-squared:  0.02586,    Adjusted R-squared:  0.0007544 
## F-statistic:  1.03 on 5 and 194 DF,  p-value: 0.4012

## Independence test (Durbin-Watson)

library(car)
# Test the residuals of the linear model for autocorrelation.
dw_results <- car::durbinWatsonTest(model = model_linear)
print(dw_results)
##  lag Autocorrelation D-W Statistic p-value
##    1      0.02837267      1.924425    0.57
##  Alternative hypothesis: rho != 0

## Test for homoscedasticity

library(lmtest)
# Breusch-Pagan test on the fitted linear model.
lmtest::bptest(model_linear)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_linear
## BP = 8.1265, df = 5, p-value = 0.1494

## Test for normality

# Shapiro-Wilk test applied to the residuals of the linear model.
shapiro_results <- shapiro.test(x = residuals(model_linear))
cat(
  "Test Statistic (W):",
  round(shapiro_results[["statistic"]], digits = 5),
  "\n"
)
## Test Statistic (W): 0.96018
cat(
  "P-Value:           ",
  format.pval(pv = shapiro_results[["p.value"]]),
  "\n"
)
## P-Value:            2.0689e-05

## Q-Q plot

# Second diagnostic plot of the fitted linear model (the Q-Q plot of its
# residuals).
plot(model_linear, which = 2)

# Linear Regression

# Same response and predictors as the earlier fit, listed in a different
# order; the result overwrites model_linear.
model_linear <- lm(
  formula = Renewable_Percentage ~
    Solar_Radiation_kWh_m2 +
    Wind_Speed_mps +
    Government_Subsidy_Millions +
    GDP_per_Capita +
    Population_Density,
  data = energy_data
)
# No newdata is supplied, so these are predictions for the rows used to fit.
predictions <- predict(object = model_linear)

## R-squared and adjusted R-squared

# Summarise the model once and reuse the result, rather than calling
# summary() separately for each statistic.
linear_summary <- summary(model_linear)
r_sq <- linear_summary$r.squared
adj_r_sq <- linear_summary$adj.r.squared

# Report both values so this section produces visible output.
cat("R-squared:         ", round(r_sq, 4), "\n")
cat("Adjusted R-squared:", round(adj_r_sq, 4), "\n")

## Mean absolute error

# Average absolute gap between the observed renewable share and the
# linear model's predictions.
mae_val <- Metrics::mae(
  actual = energy_data[["Renewable_Percentage"]],
  predicted = predictions
)
print(mae_val)
## [1] 19.88375

# Logistic Model

# Binomial GLM of the binary Adoption outcome on the same five predictors
# used in the linear model.
model_logistic <- glm(
  formula = Adoption ~
    Solar_Radiation_kWh_m2 +
    Wind_Speed_mps +
    Government_Subsidy_Millions +
    GDP_per_Capita +
    Population_Density,
  family = binomial,
  data = energy_data
)
# Coefficient table and deviance statistics.
summary(model_logistic)
## 
## Call:
## glm(formula = Adoption ~ Solar_Radiation_kWh_m2 + Wind_Speed_mps + 
##     Government_Subsidy_Millions + GDP_per_Capita + Population_Density, 
##     family = binomial, data = energy_data)
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                  6.375e-01  8.769e-01   0.727   0.4672  
## Solar_Radiation_kWh_m2       8.406e-02  1.067e-01   0.788   0.4309  
## Wind_Speed_mps              -6.098e-02  4.371e-02  -1.395   0.1630  
## Government_Subsidy_Millions  7.369e-03  2.702e-02   0.273   0.7851  
## GDP_per_Capita              -1.341e-05  7.273e-06  -1.844   0.0652 .
## Population_Density           1.720e-03  1.147e-03   1.500   0.1337  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 266.58  on 199  degrees of freedom
## Residual deviance: 256.73  on 194  degrees of freedom
## AIC: 268.73
## 
## Number of Fisher Scoring iterations: 4

## Predictions

# Fitted probabilities for the rows used to fit the logistic model.
log_probs <- predict(model_logistic, type = "response")
# Classify as 1 when the fitted probability exceeds 0.5, otherwise 0.
log_preds <- ifelse(log_probs > 0.5, 1, 0)

# Pin both dimensions to the levels 0 and 1 so the confusion matrix is
# always 2 x 2. Without this, a model that predicts a single class yields
# a one-row table and sum(diag(conf_matrix)) no longer counts the correct
# predictions.
outcome_levels <- c(0, 1)
conf_matrix <- table(
  Predicted = factor(log_preds, levels = outcome_levels),
  Actual = factor(energy_data$Adoption, levels = outcome_levels)
)
library(knitr)
kable(conf_matrix, caption = "Table 1: Logistic Regression Performance Matrix")
## Table 1: Logistic Regression Performance Matrix
## (rows = Predicted, columns = Actual)
##       0   1
##   0  19  12
##   1  58 111

## Overall accuracy

# Share of observations whose predicted class equals the observed class.
# Comparing the two vectors directly stays correct even when the model
# predicts only one class; in that case the confusion matrix is not square
# and sum(diag(conf_matrix)) would no longer count the correct predictions.
# na.rm = TRUE mirrors table(), which drops pairs containing NA.
accuracy <- mean(log_preds == energy_data$Adoption, na.rm = TRUE)
cat("\nOverall Accuracy:", round(accuracy, 4), "\n")
## 
## Overall Accuracy: 0.65