# Load necessary libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(ggthemes)
library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(stringr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Electoral votes per state (50 states + DC) under the apportionment used
# for the 2024 presidential election.
#
# The counts are written as name = value pairs so that every state sits next
# to its own number; with two parallel vectors a single dropped entry
# silently shifts every later state onto a neighbour's count.
#
# Result: a data frame with one row per state and the columns
#   State           - two-letter postal abbreviation (character)
#   Electoral_Votes - number of electors (numeric)
electoral_votes <- local({
  votes <- c(
    AL = 9,  AK = 3,  AZ = 11, AR = 6,  CA = 54, CO = 10, CT = 7,  DE = 3,
    FL = 30, GA = 16, HI = 4,  ID = 4,  IL = 19, IN = 11, IA = 6,  KS = 6,
    KY = 8,  LA = 8,  ME = 4,  MD = 10, MA = 11, MI = 15, MN = 10, MS = 6,
    MO = 10, MT = 4,  NE = 5,  NV = 6,  NH = 4,  NJ = 14, NM = 5,  NY = 28,
    NC = 16, ND = 3,  OH = 17, OK = 7,  OR = 8,  PA = 19, RI = 4,  SC = 9,
    SD = 3,  TN = 11, TX = 40, UT = 6,  VT = 3,  VA = 13, WA = 12, WV = 4,
    WI = 10, WY = 3,  DC = 3
  )

  # Guard against future edits dropping or duplicating an entry: the
  # Electoral College always totals 538 across 51 jurisdictions.
  stopifnot(
    length(votes) == 51L,
    anyDuplicated(names(votes)) == 0L,
    sum(votes) == 538
  )

  data.frame(
    State = names(votes),
    Electoral_Votes = unname(votes)
  )
})

# Load the two state-level result panels: the regular (experimental) runs
# and the control runs.
#
# When this script was last rendered, readr reported the same layout for
# both files: Date, Voice and State parsed as character; Trial and Result
# parsed as double (1,530,162 rows for the regular panel, 827,730 for the
# control panel).
#
# NOTE(review): both paths are absolute and machine-specific; they must be
# adjusted before the script can run on another machine.
panel_data <- read_csv(
  file = "/Users/jaredblack/GitHub/ElectionGPT/data/panel_election_results_state.csv"
)
control_panel_data <- read_csv(
  file = "/Users/jaredblack/GitHub/ElectionGPT/data/panel_control_election_results_state.csv"
)
# Stack the regular and control panels into a single table.
#
# Each panel is tagged with its Group label and has Voice lower-cased before
# stacking. Rows without a Voice are then dropped, and FullVoice is derived:
# control rows get the prefix "control " in front of the voice name, regular
# rows keep the bare voice name.
combined_data <- Map(
  function(rows, label) {
    mutate(rows, Group = label, Voice = tolower(Voice))
  },
  list(panel_data, control_panel_data),
  c("Regular", "Control")
) %>%
  bind_rows() %>%
  filter(!is.na(Voice)) %>%
  mutate(
    FullVoice = if_else(Group == "Control", paste("control", Voice), Voice)
  )

# Expected electoral votes per voice and day.
#
# Step 1 (per FullVoice / Date / State): the share of rows with Result == 0,
#   the state's electoral votes, and the number of distinct trials.
#   NOTE(review): presumably Result == 0 codes a Harris win, given the plot
#   labels further down; confirm against the data dictionary.
# Step 2 (per Date / FullVoice): sum of share * electoral votes over states.
#   States without a match in electoral_votes have NA votes and are skipped
#   by na.rm. Total_Trials is the trial count of the first state row in the
#   group, not a total across states.
daily_votes <- combined_data %>%
  left_join(electoral_votes, by = "State") %>%
  group_by(FullVoice, Date, State) %>%
  summarise(
    Win_Proportion = mean(Result == 0, na.rm = TRUE),
    Electoral_Votes = Electoral_Votes[1],
    Trials = n_distinct(Trial),
    .groups = "drop"
  ) %>%
  group_by(Date, FullVoice) %>%
  summarise(
    Daily_Electoral_Votes = sum(Win_Proportion * Electoral_Votes, na.rm = TRUE),
    Total_Trials = Trials[1],
    .groups = "drop"
  )

# Per-voice summary figures intended for the plot tooltip.
#
# One row per FullVoice with: number of distinct dates (Days), mean of the
# daily trial counts (Avg_Trials), number of daily rows (Total_Days), and
# the median and quartiles of the daily electoral-vote totals. These are
# computed on the untrimmed daily_votes table.
summary_stats <- daily_votes %>%
  group_by(FullVoice) %>%
  summarise(
    Days = n_distinct(Date),
    Avg_Trials = mean(Total_Trials),
    Total_Days = n(),
    Median_Votes = median(Daily_Electoral_Votes),
    Q1 = quantile(Daily_Electoral_Votes, probs = 0.25),
    Q3 = quantile(Daily_Electoral_Votes, probs = 0.75),
    .groups = "drop"
  )

# Trim each voice's extreme days, attach the per-voice summary figures, and
# derive the labels that control how the boxes are arranged.
daily_votes_filtered <- daily_votes %>%
  group_by(FullVoice) %>%
  # Keep only the days inside the voice's own 1st-99th percentile band
  filter(
    Daily_Electoral_Votes >=
      quantile(Daily_Electoral_Votes, probs = 0.01, na.rm = TRUE),
    Daily_Electoral_Votes <=
      quantile(Daily_Electoral_Votes, probs = 0.99, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # Bring in Days, Avg_Trials, Median_Votes, ... for every remaining row
  left_join(summary_stats, by = "FullVoice") %>%
  mutate(
    # Control voices are the ones carrying the "control " prefix
    IsControl = grepl("control", FullVoice, fixed = TRUE),
    BaseVoice = toupper(gsub("control ", "", FullVoice, fixed = TRUE)),
    VoiceType = if_else(IsControl, "Control", "Experimental"),
    # The level order fixes the left-to-right order on the x-axis; any
    # voice/type pairing not listed here becomes NA
    PlotGroup = factor(
      paste(BaseVoice, VoiceType),
      levels = c(
        "MSNBC Experimental", "MSNBC Control",
        "FOX Experimental", "FOX Control",
        "DIRECT Experimental", "DIRECT Control",
        "BBC Experimental", "BBC Control"
      )
    )
  )

# Boxplot of daily Harris electoral votes, one box per voice/group pairing,
# with a dashed reference line at the 270 votes needed to win.
#
# Tooltip text is built from per-box values so that every row of a box
# carries the same text:
#   - Days and Avg_Trials come from the per-voice summary joined earlier
#     (computed before the 1% trimming).
#   - Box_Median is the median of the rows actually plotted for that box.
#     Calling median() directly inside aes() would be evaluated over the
#     whole data frame and report one global median for every box.
p <- daily_votes_filtered %>%
  group_by(PlotGroup) %>%
  mutate(Box_Median = median(Daily_Electoral_Votes)) %>%
  ungroup() %>%
  ggplot(
    aes(
      x = PlotGroup,
      y = Daily_Electoral_Votes,
      fill = IsControl,
      text = paste(
        "Voice:", BaseVoice,
        "<br>Group:", VoiceType,
        "<br>Days:", Days,
        "<br>Avg Trials per Day:", round(Avg_Trials, 1),
        "<br>Median Votes:", round(Box_Median, 1)
      )
    )
  ) +
  geom_boxplot(
    width = 0.6,                     # Width of the boxes
    position = position_dodge(0.7),  # Smaller value brings exp/control pairs closer
    alpha = 0.7,
    outlier.shape = 21,
    outlier.alpha = 0.5
  ) +
  # `linewidth` replaces `size` for lines as of ggplot2 3.4.0
  geom_hline(
    yintercept = 270,
    linetype = "dashed",
    color = "darkred",
    linewidth = 1
  ) +
  scale_fill_manual(
    values = c("FALSE" = "#5AAE61", "TRUE" = "#7FB6D5"),
    labels = c("FALSE" = "Experimental", "TRUE" = "Control")
  ) +
  # Axis labels: drop the " Experimental" suffix, keep "Control" and push
  # control labels onto a second line so adjacent labels do not collide
  scale_x_discrete(labels = function(x) {
    x <- gsub(" Experimental", "", x)
    ifelse(grepl("Control", x), paste0("\n", x), x)
  }) +
  labs(
    title = "Daily Electoral Votes for Kamala Harris by News Reader",
    subtitle = "Distribution of Daily Results (Excluding Top and Bottom 1%)",
    y = "Daily Electoral Votes for Harris",
    x = "Voice"
  ) +
  theme_fivethirtyeight() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.direction = "horizontal",
    plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    axis.title = element_text(face = "bold", size = 12),
    axis.text = element_text(size = 10),
    axis.text.x = element_text(angle = 0, hjust = 0.5),
    plot.margin = margin(10, 10, 10, 10)
  )
# Turn the ggplot into an interactive plotly widget that shows only the
# custom `text` aesthetic on hover, then adjust the plotly layout: centred
# horizontal legend below the plot, a "270 Votes to Win" label just above
# the reference line, and extra bottom margin to make room for the legend.
#
# When this script was last rendered, this step warned that plotly.js does
# not (yet) support horizontal legend items; progress is tracked at
# https://github.com/plotly/plotly.js/issues/53
interactive_plot <- plotly::layout(
  ggplotly(p, tooltip = "text"),
  showlegend = TRUE,
  legend = list(
    orientation = "h",
    xanchor = "center",
    x = 0.5,
    y = -0.2
  ),
  annotations = list(
    list(
      x = 0.75,
      y = 272,
      text = "270 Votes to Win",
      showarrow = FALSE,
      font = list(color = "darkred"),
      xanchor = "left"
    )
  ),
  margin = list(b = 100)
)
# Explanatory note shown with the chart, assembled as one HTML string: a
# small-print <div> whose sentences are separated by <br> line breaks.
note_html <- paste(
  c(
    "<div style='font-size: 11px; line-height: 1.2; margin-top: 20px;'>",
    "Note: This chart shows the distribution of daily electoral votes for Kamala Harris based on different news sources.<br>",
    "Hover over boxplots to see detailed statistics including number of days and trials.<br>",
    "The boxes show the quartiles and median, while whiskers extend to the most extreme non-outlier values.<br>",
    "Outliers (top and bottom 1% per group) have been removed. The dashed red line marks the 270 votes needed to win.",
    "</div>"
  ),
  collapse = ""
)

# Attach the note to the widget as prepended HTML content; leaving the
# result unassigned lets it print (render) at top level.
interactive_plot %>%
  htmlwidgets::prependContent(htmltools::HTML(note_html))
## Note: This chart shows the distribution of daily electoral votes for Kamala Harris based on different news sources.
## Hover over boxplots to see detailed statistics including number of days and trials.
## The boxes show the quartiles and median, while whiskers extend to the most extreme non-outlier values.
## Outliers (top and bottom 1% per group) have been removed. The dashed red line marks the 270 votes needed to win.