library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
# NOTE(review): a hard-coded absolute setwd() ties this script to one machine.
# It is kept unchanged because code outside this section may rely on the
# working directory; consider a project-relative path instead.
setwd("C:/Users/kaitl/OneDrive/Documents/590_Working")

# Read the energy data with explicit column types (i = integer, c = character).
# This script treats ".." as the missing-value marker. Declaring it through
# `na` lets readr turn it into NA while parsing, rather than reporting parsing
# problems in the integer columns and then replacing ".." across the whole
# data frame afterwards.
energy <- read_delim(
  "./590_FinalData1.csv",
  delim = ",",
  col_types = "icciiciiiiiiii",
  na = c("", "NA", "..")
)
# If readr still warns about parsing issues, inspect them with problems(energy)
# (for example, non-integer values in a column declared as integer).

# Working copy used by the rest of the analysis.
energy1 <- energy

Build a linear (or generalized linear) model as you like. Use whatever response variable and explanatory variables you prefer. Use the tools from previous weeks to diagnose the model. Highlight any issues with the model. Interpret at least one of the coefficients.

Goal 1: Linear model

Hypothesis: When the rural electricity access of a country increases, so does the total_population_electricity_access.

Response (Y): total_population_electricity_access

Explanatory (X): rural_electricity_access

# Simple linear regression matching the stated hypothesis:
#   response (Y)    = total_population_electricity_access
#   explanatory (X) = rural_electricity_access
# In an R formula the response goes on the left of `~`. The two variables were
# previously swapped (rural ~ total), which fitted the reverse regression and
# disagreed with the scatter plot (x = rural, y = total).
# Rows with NA in either variable are dropped (na.action = na.omit).
model <- lm(
  total_population_electricity_access ~ rural_electricity_access,
  data = energy1,
  na.action = na.omit
)

# R-squared of the linear fit. With a single predictor this value is the same
# whichever variable is the response; only the coefficients change.
rsquared <- summary(model)$r.squared

# Scatter plot of total-population access against rural access, overlaid with
# a dashed gray least-squares line and ggplot2's default smoother. The
# subtitle reports the R-squared stored in `rsquared`.
ggplot(
  data = energy1,
  mapping = aes(
    x = rural_electricity_access,
    y = total_population_electricity_access
  )
) +
  geom_point() +
  geom_smooth(
    method = "lm",
    se = FALSE,
    colour = "gray",
    linetype = "dashed"
  ) +
  geom_smooth(se = FALSE) +
  ggtitle(
    label = "Rural Electricity Access vs. Total Population Electricity Access",
    subtitle = paste("Linear Fit R-Squared =", round(rsquared, 3))
  ) +
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5226 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 5226 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 5226 rows containing missing values (`geom_point()`).

model diagnosis

# energy1$total_population_electricity_access <- factor(energy1$total_population_electricity_access)
# m1 <- lm(energy1$rural_electricity_access ~ energy1$urban_electricity_access, data = energy1) #simple model
# m2 <- lm(energy1$total_population_electricity_access ~ energy1$rural_electricity_access + energy1$urban_electricity_access, data = energy1) #complex model
# 
# m1_pred <- data.frame(r = energy1$rural_electricity_access, r_pred = predict(m1))
# m2_pred <- data.frame(r = energy1$rural_electricity_access, r_pred = predict(m2))
# 
# ggplot(energy1, aes(rural_electricity_access, total_population_electricity_access))+
#   geom_point() +
#   geom_line(data = m1_pred, aes(x = rural_electricity_access, y = r_pred), colour = "red")+
#   geom_line(data = m2_pred, aes(x = rural_electricity_access, y = r_pred), colour = "blue")
# Print the fitted intercept and slope of `model`.
model |> coef()
##                         (Intercept) total_population_electricity_access 
##                          -21.697048                            1.216666

So we see a positive relationship between total_population_electricity_access and rural_electricity_access: countries where a larger share of the rural population has electricity access also tend to have a larger share of the total population with access. It is important to note that many rows were dropped because of NA values (the plot above reports 5,226 rows removed), so the fit only reflects observations where both variables are recorded and may not be representative of the full dataset.