## Load all relevant packages and import the data
library(readxl)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Metrics)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(readxl)
# Read the assessment workbook into `energy_data`.
energy_data <- read_excel(path = "Dataset for the Assessment.xlsx")
# Show the structure (column names and types) to verify that the import
# succeeded.
str(object = energy_data)
## tibble [200 × 11] (S3: tbl_df/tbl/data.frame)
## $ Region : chr [1:200] "Wales" "England" "Scotland" "Wales" ...
## $ Renewable_Capacity_MW : num [1:200] 64.1 336.4 191.5 278.9 458.4 ...
## $ Electricity_Demand_MWh : num [1:200] 1465 5782 5866 6737 7535 ...
## $ Solar_Radiation_kWh_m2 : num [1:200] 6.49 5.68 4.55 7.07 6.42 ...
## $ Wind_Speed_mps : num [1:200] 9.39 3.62 7.04 4.61 3.76 ...
## $ Population_Density : num [1:200] 143.5 61.9 131.6 312.4 239.6 ...
## $ GDP_per_Capita : num [1:200] 14049 77837 71865 74943 79644 ...
## $ Government_Subsidy_Millions: num [1:200] 1.6 18.79 1.99 11.28 14.47 ...
## $ CO2_Emissions_MT : num [1:200] 2.644 2.448 0.226 1.772 1.963 ...
## $ Renewable_Percentage : num [1:200] 97.3 54.6 44.9 60.5 55.2 ...
## $ Adoption : num [1:200] 1 0 1 0 1 0 1 0 0 0 ...
# Preview the first six rows of the imported data.
head(x = energy_data, n = 6L)
## # A tibble: 6 × 11
## Region Renewable_Capacity_MW Electricity_Demand_MWh Solar_Radiation_kWh_m2
## <chr> <dbl> <dbl> <dbl>
## 1 Wales 64.1 1465. 6.49
## 2 England 336. 5782. 5.68
## 3 Scotland 191. 5866. 4.55
## 4 Wales 279. 6737. 7.07
## 5 Wales 458. 7535. 6.42
## 6 England 162. 9783. 3.81
## # ℹ 7 more variables: Wind_Speed_mps <dbl>, Population_Density <dbl>,
## # GDP_per_Capita <dbl>, Government_Subsidy_Millions <dbl>,
## # CO2_Emissions_MT <dbl>, Renewable_Percentage <dbl>, Adoption <dbl>
# Count the missing values in every column of the data set.
print(x = "Missing Values per Column:")
## [1] "Missing Values per Column:"
colSums(x = is.na(energy_data))
## Region Renewable_Capacity_MW
## 0 0
## Electricity_Demand_MWh Solar_Radiation_kWh_m2
## 0 0
## Wind_Speed_mps Population_Density
## 0 0
## GDP_per_Capita Government_Subsidy_Millions
## 0 0
## CO2_Emissions_MT Renewable_Percentage
## 0 0
## Adoption
## 0
# Outlier check: horizontal box plot of electricity demand.
boxplot(
  x = energy_data[["Electricity_Demand_MWh"]],
  horizontal = TRUE,
  col = "tomato",
  main = "Outlier Check: Electricity Demand"
)
# Correlation between government subsidy and renewable percentage, shown as a
# scatter plot with a fitted linear trend line; the coefficient is reported in
# the plot subtitle.
# NOTE(review): the previous heading here referred to checking the 'success'
# count for the logistic regression, but no such check is performed in this
# section -- confirm whether one is still needed.
cor_policy <- cor(
  energy_data$Government_Subsidy_Millions,
  energy_data$Renewable_Percentage
)
ggplot(
  energy_data,
  aes(x = Government_Subsidy_Millions, y = Renewable_Percentage)
) +
  geom_point(color = "darkgreen", alpha = 0.4) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(
    title = "Government Subsidies vs. Renewable Output",
    subtitle = paste("Correlation Coefficient (r) =", round(cor_policy, 4)),
    x = "Government Subsidy (Millions)",
    y = "Renewable Percentage (%)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Wind speed vs. renewable percentage: correlation coefficient plus a scatter
# plot with a fitted linear trend line.
cor_wind <- with(energy_data, cor(Wind_Speed_mps, Renewable_Percentage))
ggplot(
  data = energy_data,
  mapping = aes(x = Wind_Speed_mps, y = Renewable_Percentage)
) +
  geom_point(alpha = 0.4, color = "blue") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    x = "Wind Speed (mps)",
    y = "Renewable Percentage (%)",
    title = "Relationship: Wind Speed vs. Renewable Percentage",
    subtitle = paste0("Correlation Coefficient (r) = ", round(cor_wind, 4)),
    caption = "Source: Renewable Energy Dataset"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Multiple linear regression of renewable percentage on subsidy, wind speed,
# solar radiation, GDP per capita and population density.
model_linear <- lm(
  formula = Renewable_Percentage ~
    Government_Subsidy_Millions +
    Wind_Speed_mps +
    Solar_Radiation_kWh_m2 +
    GDP_per_Capita +
    Population_Density,
  data = energy_data
)
# Print coefficients, residual summary and fit statistics.
summary(object = model_linear)
##
## Call:
## lm(formula = Renewable_Percentage ~ Government_Subsidy_Millions +
## Wind_Speed_mps + Solar_Radiation_kWh_m2 + GDP_per_Capita +
## Population_Density, data = energy_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.844 -21.201 1.302 18.868 43.753
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.544e+01 9.716e+00 5.706 4.27e-08 ***
## Government_Subsidy_Millions 6.643e-02 3.000e-01 0.221 0.8250
## Wind_Speed_mps -3.266e-01 4.844e-01 -0.674 0.5010
## Solar_Radiation_kWh_m2 6.683e-01 1.179e+00 0.567 0.5714
## GDP_per_Capita -6.644e-05 8.023e-05 -0.828 0.4086
## Population_Density 2.195e-02 1.266e-02 1.734 0.0844 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.4 on 194 degrees of freedom
## Multiple R-squared: 0.02586, Adjusted R-squared: 0.0007544
## F-statistic: 1.03 on 5 and 194 DF, p-value: 0.4012
# Independence test (Durbin-Watson) on the residuals of the linear model.
library(car)
# durbinWatsonTest() obtains its p-value by bootstrap simulation, so the
# reported p-value changes from run to run unless the random seed is fixed.
set.seed(123)
dw_results <- durbinWatsonTest(model_linear)
print(dw_results)
## lag Autocorrelation D-W Statistic p-value
## 1 0.02837267 1.924425 0.57
## Alternative hypothesis: rho != 0
# Homoscedasticity check: Breusch-Pagan test on the linear model.
library(lmtest)
bptest(formula = model_linear)
##
## studentized Breusch-Pagan test
##
## data: model_linear
## BP = 8.1265, df = 5, p-value = 0.1494
# Normality check: Shapiro-Wilk test on the residuals of the linear model.
shapiro_results <- shapiro.test(x = resid(model_linear))
# Report the W statistic (5 decimal places) and the formatted p-value.
cat("Test Statistic (W):", round(shapiro_results[["statistic"]], 5), "\n")
## Test Statistic (W): 0.96018
cat("P-Value: ", format.pval(shapiro_results[["p.value"]]), "\n")
## P-Value: 2.0689e-05
# Normal Q-Q plot of the linear model's residuals (diagnostic plot number 2).
plot(x = model_linear, which = 2)
# Linear regression: fit the model and collect its in-sample predictions.
model_linear <- lm(
  formula = Renewable_Percentage ~
    Solar_Radiation_kWh_m2 +
    Wind_Speed_mps +
    Government_Subsidy_Millions +
    GDP_per_Capita +
    Population_Density,
  data = energy_data
)
predictions <- predict(object = model_linear)
# R-squared and adjusted R-squared of the fitted model.
r_sq <- summary(model_linear)[["r.squared"]]
adj_r_sq <- summary(model_linear)[["adj.r.squared"]]
# Mean absolute error between the observed and predicted renewable percentage.
mae_val <- mae(
  actual = energy_data[["Renewable_Percentage"]],
  predicted = predictions
)
print(mae_val)
## [1] 19.88375
# Logistic regression: model the binary Adoption outcome with a binomial GLM
# on the same five predictors as the linear model.
model_logistic <- glm(
  formula = Adoption ~
    Solar_Radiation_kWh_m2 +
    Wind_Speed_mps +
    Government_Subsidy_Millions +
    GDP_per_Capita +
    Population_Density,
  family = binomial,
  data = energy_data
)
# Print the coefficient table and deviance statistics.
summary(object = model_logistic)
##
## Call:
## glm(formula = Adoption ~ Solar_Radiation_kWh_m2 + Wind_Speed_mps +
## Government_Subsidy_Millions + GDP_per_Capita + Population_Density,
## family = binomial, data = energy_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.375e-01 8.769e-01 0.727 0.4672
## Solar_Radiation_kWh_m2 8.406e-02 1.067e-01 0.788 0.4309
## Wind_Speed_mps -6.098e-02 4.371e-02 -1.395 0.1630
## Government_Subsidy_Millions 7.369e-03 2.702e-02 0.273 0.7851
## GDP_per_Capita -1.341e-05 7.273e-06 -1.844 0.0652 .
## Population_Density 1.720e-03 1.147e-03 1.500 0.1337
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 266.58 on 199 degrees of freedom
## Residual deviance: 256.73 on 194 degrees of freedom
## AIC: 268.73
##
## Number of Fisher Scoring iterations: 4
# Predictions: fitted probabilities from the logistic model, classified as 1
# when the probability exceeds 0.5 and as 0 otherwise.
log_probs <- predict(model_logistic, type = "response")
log_preds <- ifelse(log_probs > 0.5, 1, 0)
# Fix both factors to the levels 0 and 1 so the confusion matrix is always
# 2 x 2, even if the model predicts only one class; a non-square table would
# make any diag()-based accuracy calculation wrong.
conf_matrix <- table(
  Predicted = factor(log_preds, levels = c(0, 1)),
  Actual = factor(energy_data$Adoption, levels = c(0, 1))
)
library(knitr)
kable(conf_matrix, caption = "Table 1: Logistic Regression Performance Matrix")
## |   | 0 | 1 |
## |---|---|---|
## | 0 | 19 | 12 |
## | 1 | 58 | 111 |
# Overall accuracy: share of observations whose predicted class equals the
# observed Adoption value. Computed directly from the two vectors rather than
# from diag(conf_matrix), which is only correct when the confusion matrix is
# square (it is not if the model predicts a single class).
stopifnot(length(log_preds) == nrow(energy_data))
# na.rm = TRUE mirrors table(), which drops pairs containing a missing value.
accuracy <- mean(log_preds == energy_data$Adoption, na.rm = TRUE)
cat("\nOverall Accuracy:", round(accuracy, 4), "\n")
##
## Overall Accuracy: 0.65