library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
setwd("C:/Users/kaitl/OneDrive/Documents/590_Working")
#update data types of dataframe
energy <- read_delim("./590_FinalData1.csv", delim = ",", col_types = "nccnncnnnnnnnn")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
energy1 <- energy
energy1[energy1 == '..'] <- NA
Select a continuous column and call it response variable
Select categorical column and call it explanatory variable
res_var <- energy$energy_intensity
exp_var <- energy$full_pop_electricity_access
Hypothesis and ANOVA test given the situation
Hypothesis: A higher energy intensity is apparent when the entire population has electricity access
energy1 |>
group_by(full_pop_electricity_access) |>
summarise(mean(energy_intensity, na.rm = TRUE), sd(energy_intensity, na.rm = TRUE))
## # A tibble: 2 × 3
## full_pop_electricity_access mean(energy_intensity, na…¹ sd(energy_intensity,…²
## <chr> <dbl> <dbl>
## 1 FALSE 7.51 6.63
## 2 TRUE 5.82 3.13
## # ℹ abbreviated names: ¹`mean(energy_intensity, na.rm = TRUE)`,
## # ²`sd(energy_intensity, na.rm = TRUE)`
Interestingly, this tells us that the countries that have full electricity access, have a smaller energy intensity mean that those without full electricity access.
F-Test
n <- nrow(energy)
k <- n_distinct(energy$full_pop_electricity_access)
ggplot() +
geom_function(xlim = c(0, 2),
fun = \(x) df(x, k - 1, n - k)) +
geom_vline(xintercept = 1, color = 'orange') +
labs(title = 'F Distribution for Energy full population electricity access',
x = "F Values",
y = "Probability Density") +
theme_hc()
This does not look like a normal F-test Distribution. The F value is not close to 1 at all, suggesting the energy intensity between populations will 100% access to electricity and those without 100% access will not see much of a difference between the two.
m <- aov(energy_intensity ~ full_pop_electricity_access, data = energy1)
summary(m)
## Df Sum Sq Mean Sq F value Pr(>F)
## full_pop_electricity_access 1 2851 2850.6 82.83 <2e-16 ***
## Residuals 4945 170190 34.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2046 observations deleted due to missingness
I believe the p-value is small here, so the means are unlikely to all be equal.
pairwise.t.test(energy1$energy_intensity, energy1$full_pop_electricity_access, p.adjust.method = "bonferroni")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: energy1$energy_intensity and energy1$full_pop_electricity_access
##
## FALSE
## TRUE <2e-16
##
## P value adjustment method: bonferroni
# # we use a "_" so we don't overwrite the original function
# boot_ci <- function (v, func = median, conf = 0.95, n_iter = 100) {
# # the function we want to run on each iteration i
# boot_func <- \(x, i) func(x[i])
#
# # the boot instance, running the function on each iteration
# b <- boot(v, boot_func, R = n_iter)
# b <- boot.ci(b, conf = conf, type = "perc")
#
# # return just the lower and upper values
# return(c("lower" = b$percent[4],
# "upper" = b$percent[5]))
# }
# df_ci <- energy1 |>
# group_by(full_pop_electricity_access) |>
# summarise(ci_lower = boot_ci(energy_intensity, mean)['lower'],
# mean_price = mean(energy_intensity),
# ci_upper = boot_ci(energy_intensity, mean)['upper'])
#
# df_ci
There is an error here^^^
# df_ci |>
# ggplot() +
# geom_errorbarh(mapping = aes(y = full_pop_electricity_access,
# xmin=ci_lower, xmax=ci_upper,
# color = '95% C.I.'), height = 0.5) +
# geom_point(mapping = aes(x = mean_price, y = energy_intensity,
# color = 'Group Mean'),
# shape = '|',
# size = 5) +
# scale_color_manual(values=c('black', 'orange')) +
# theme_minimal() +
# labs(title = "Energy intensity By Full Population Electricity Access",
# x = "Energy Intensity",
# y = "Full Population Access",
# color = '')
There is not enough data to note if countries with full access to electricity typically have a higher energy intensity value.
Another Example
Instead of energy intensity, I swapped for the total electricity output. Fortunately, this may show us that countries with full electricity access have a larger electricity output, just not a higher energy intensity.
energy1 |>
group_by(full_pop_electricity_access) |>
summarise(mean(total_elec_output, na.rm = TRUE), sd(total_elec_output, na.rm = TRUE))
## # A tibble: 2 × 3
## full_pop_electricity_access mean(total_elec_output, n…¹ sd(total_elec_output…²
## <chr> <dbl> <dbl>
## 1 FALSE 38703. 232227.
## 2 TRUE 167219. 532273.
## # ℹ abbreviated names: ¹`mean(total_elec_output, na.rm = TRUE)`,
## # ²`sd(total_elec_output, na.rm = TRUE)`
Evaluate the fit of a linear regression model on this column of data:
Linear shape?
energy1 |>
ggplot(mapping = aes(x = full_pop_electricity_access, y = total_elec_output)) +
geom_point(size = 2, color = 'darkblue') +
theme_minimal()
## Warning: Removed 1108 rows containing missing values (`geom_point()`).
Linear Regression
# lm_ <- \(x) (1 / 2) * x + 3
#
# energy1 |>
# ggplot(mapping = aes(x = full_pop_electricity_access, y = total_elec_output)) +
# geom_point(size = 2) +
# geom_segment(mapping = aes(xend = full_pop_electricity_access, y = total_elec_output, yend = lm_(full_pop_electricity_access),
# color = 'error'), linewidth = 1) +
# geom_smooth(method = "lm", se = FALSE, color = 'red', linewidth = 0.5) +
# scale_color_manual(values = c('darkblue')) +
# labs(color = '') +
# theme_minimal()