library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
setwd("C:/Users/kaitl/OneDrive/Documents/590_Working")

#update data types of dataframe
energy <- read_delim("./590_FinalData1.csv", delim = ",", col_types = "nccnncnnnnnnnn")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
energy1 <- energy
energy1[energy1 == '..'] <- NA

Select a continuous column and call it response variable

Select categorical column and call it explanatory variable

res_var <- energy$energy_intensity
exp_var <- energy$full_pop_electricity_access

Hypothesis and ANOVA test given the situation

Hypothesis: A higher energy intensity is apparent when the entire population has electricity access

energy1 |>
  group_by(full_pop_electricity_access) |>
  summarise(mean(energy_intensity, na.rm = TRUE), sd(energy_intensity, na.rm = TRUE))
## # A tibble: 2 × 3
##   full_pop_electricity_access mean(energy_intensity, na…¹ sd(energy_intensity,…²
##   <chr>                                             <dbl>                  <dbl>
## 1 FALSE                                              7.51                   6.63
## 2 TRUE                                               5.82                   3.13
## # ℹ abbreviated names: ¹​`mean(energy_intensity, na.rm = TRUE)`,
## #   ²​`sd(energy_intensity, na.rm = TRUE)`

Interestingly, this tells us that the countries that have full electricity access, have a smaller energy intensity mean that those without full electricity access.

F-Test

n <- nrow(energy)
k <- n_distinct(energy$full_pop_electricity_access)

ggplot() +
  geom_function(xlim = c(0, 2), 
                fun = \(x) df(x, k - 1, n - k)) +
  geom_vline(xintercept = 1, color = 'orange') +
  labs(title = 'F Distribution for Energy full population electricity access',
       x = "F Values",
       y = "Probability Density") +
  theme_hc()

This does not look like a normal F-test Distribution. The F value is not close to 1 at all, suggesting the energy intensity between populations will 100% access to electricity and those without 100% access will not see much of a difference between the two.

m <- aov(energy_intensity ~ full_pop_electricity_access, data = energy1)
summary(m)
##                               Df Sum Sq Mean Sq F value Pr(>F)    
## full_pop_electricity_access    1   2851  2850.6   82.83 <2e-16 ***
## Residuals                   4945 170190    34.4                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2046 observations deleted due to missingness

I believe the p-value is small here, so the means are unlikely to all be equal.

pairwise.t.test(energy1$energy_intensity, energy1$full_pop_electricity_access, p.adjust.method = "bonferroni")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  energy1$energy_intensity and energy1$full_pop_electricity_access 
## 
##      FALSE 
## TRUE <2e-16
## 
## P value adjustment method: bonferroni
# # we use a "_" so we don't overwrite the original function
# boot_ci <- function (v, func = median, conf = 0.95, n_iter = 100) {
#   # the function we want to run on each iteration i
#   boot_func <- \(x, i) func(x[i])
#   
#   # the boot instance, running the function on each iteration
#   b <- boot(v, boot_func, R = n_iter)
#   b <- boot.ci(b, conf = conf, type = "perc")
#   
#   # return just the lower and upper values
#   return(c("lower" = b$percent[4],
#            "upper" = b$percent[5]))
# }
# df_ci <- energy1 |>
#   group_by(full_pop_electricity_access) |>
#   summarise(ci_lower = boot_ci(energy_intensity, mean)['lower'],
#             mean_price = mean(energy_intensity),
#             ci_upper = boot_ci(energy_intensity, mean)['upper'])
# 
# df_ci

There is an error here^^^

# df_ci |>
#   ggplot() +
#   geom_errorbarh(mapping = aes(y = full_pop_electricity_access, 
#                                xmin=ci_lower, xmax=ci_upper,
#                                color = '95% C.I.'), height = 0.5) +
#   geom_point(mapping = aes(x = mean_price, y = energy_intensity,
#                            color = 'Group Mean'),
#              shape = '|',
#              size = 5) +
#   scale_color_manual(values=c('black', 'orange')) +
#   theme_minimal() +
#   labs(title = "Energy intensity By Full Population Electricity Access",
#        x = "Energy Intensity",
#        y = "Full Population Access",
#        color = '')

There is not enough data to note if countries with full access to electricity typically have a higher energy intensity value.

Another Example

Instead of energy intensity, I swapped for the total electricity output. Fortunately, this may show us that countries with full electricity access have a larger electricity output, just not a higher energy intensity.

energy1 |>
  group_by(full_pop_electricity_access) |>
  summarise(mean(total_elec_output, na.rm = TRUE), sd(total_elec_output, na.rm = TRUE))
## # A tibble: 2 × 3
##   full_pop_electricity_access mean(total_elec_output, n…¹ sd(total_elec_output…²
##   <chr>                                             <dbl>                  <dbl>
## 1 FALSE                                            38703.                232227.
## 2 TRUE                                            167219.                532273.
## # ℹ abbreviated names: ¹​`mean(total_elec_output, na.rm = TRUE)`,
## #   ²​`sd(total_elec_output, na.rm = TRUE)`

Evaluate the fit of a linear regression model on this column of data:

Linear shape?

energy1 |>
  ggplot(mapping = aes(x = full_pop_electricity_access, y = total_elec_output)) +
  geom_point(size = 2, color = 'darkblue') +
  theme_minimal()
## Warning: Removed 1108 rows containing missing values (`geom_point()`).

Linear Regression

# lm_ <- \(x) (1 / 2) * x + 3
# 
# energy1 |>
#   ggplot(mapping = aes(x = full_pop_electricity_access, y = total_elec_output)) +
#   geom_point(size = 2) +
#   geom_segment(mapping = aes(xend = full_pop_electricity_access, y = total_elec_output, yend = lm_(full_pop_electricity_access), 
#                              color = 'error'), linewidth = 1) +
#   geom_smooth(method = "lm", se = FALSE, color = 'red', linewidth = 0.5) + 
#   scale_color_manual(values = c('darkblue')) +
#   labs(color = '') +
#   theme_minimal()