knitr::opts_chunk$set(class.source = "foldable")

library(dplyr)

# Read the Voting data with read.table() defaults; column names are assigned
# right after the read.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this exact location exists.
voting_path <- "C:/Users/aisli/OneDrive/Documents/classes/STAT525/datasets from textbook/Voting.txt"
data <- read.table(voting_path)

# Label the 21 columns, in file order.
names(data) <- c(
  "co", "lat", "long", "npop", "whit", "blac", "hisp",
  "o65", "hsed", "coll", "inco",
  "bush", "gore", "brow", "nade", "harr", "hage", "buch", "mcre", "phil",
  "moor"
)

# Palm Beach (county code 50) is a known possible outlier, so it is split off
# from the remaining counties. Each piece gets a `pergore` column: Gore's
# share of the combined Bush + Gore vote (a proportion, not multiplied by 100).

# every county except Palm Beach
data_nopalm <- data %>%
  filter(co != 50) %>%
  mutate(pergore = gore / (bush + gore))

# Palm Beach only
data_palm <- data %>%
  filter(co == 50) %>%
  mutate(pergore = gore / (bush + gore))

# histogram of Y (Gore's share of the Bush + Gore vote) for the data
# excluding Palm Beach
hist(
  data_nopalm$pergore,
  col = "lightblue",
  border = "black",
  main = "Histogram of Percentage of Gore Votes",
  xlab = "Percentage",
  ylab = "Frequency"
)

knitr::opts_chunk$set(class.source = "foldable")
# boxplot of Y for the data excluding Palm Beach
boxplot(
  data_nopalm$pergore,
  main = "Boxplot of Percentage of Gore Votes",
  ylab = "Percentage",
  col = "lightblue",
  border = "black"
)

# identifying outlier counties
# `out` holds the values that boxplot.stats() flags as outliers. The matching
# rows are reported with their county code (`co`) instead of the bare row
# position: once rows have been filtered out of the data, a row position
# need not equal the county code, so positions alone can point at the wrong
# county.
out <- boxplot.stats(data_nopalm$pergore)$out
data_nopalm[data_nopalm$pergore %in% out, c("co", "pergore")]
knitr::opts_chunk$set(class.source = "foldable")
# Use a 2 x 5 plotting grid so the ten predictor plots share one page.
# The layout stays in effect until par(mfrow = c(1, 1)) is called later on.
# NOTE(review): when rendered through knitr, par() settings may be reset
# between chunks unless the `global.par` package option is TRUE -- confirm
# that the later plots still use this grid in the knitted output.
par(mfrow = c(2, 5))

# One histogram per predictor (columns 2 through 11 of data_nopalm).
for (predictor in names(data_nopalm)[2:11]) {
  hist(
    data_nopalm[[predictor]],
    main = paste("Histogram of", predictor),
    xlab = predictor,
    col = "lightblue",
    border = "black"
  )
}

knitr::opts_chunk$set(class.source = "foldable")
# For every predictor (columns 2 through 11): draw a boxplot, then print the
# values flagged as outliers followed by the full rows they belong to.
for (predictor in names(data_nopalm)[2:11]) {
  values <- data_nopalm[[predictor]]

  boxplot(
    values,
    main = paste("Box Plot of", predictor),
    ylab = predictor,
    col = "lightblue",
    border = "black"
  )

  # values beyond the boxplot whiskers, as reported by boxplot.stats()
  flagged <- boxplot.stats(values)$out
  print(flagged)

  # rows whose value for this predictor is among the flagged values
  print(data_nopalm[values %in% flagged, ])
}

knitr::opts_chunk$set(class.source = "foldable")
# scatterplots of Y against each predictor (columns 2 through 11)
# The response (`pergore`) goes on the vertical axis and the predictor on the
# horizontal axis, so each panel matches its "Gore Votes vs <predictor>" title
# and the usual response-versus-predictor orientation.
for (col_index in 2:11) {
  col_name <- colnames(data_nopalm)[col_index]
  plot(
    data_nopalm[[col_name]], data_nopalm$pergore,
    main = paste("Gore Votes vs", col_name),
    xlab = col_name,
    ylab = "Percentage of Gore Votes",
    col = "blue"
  )
}

knitr::opts_chunk$set(class.source = "foldable")
# Go back to one plot per page.
par(mfrow = c(1, 1))

# Keep only the predictor columns (positions 2 through 11).
subset_nopalm <- data_nopalm %>%
  select(2:11)

# Scatterplot matrix of the predictors.
plot(subset_nopalm)

knitr::opts_chunk$set(class.source = "foldable")
# Pairwise correlations between the predictors.
subset_nopalm %>%
  cor()
knitr::opts_chunk$set(class.source = "foldable")
# Summary of each predictor, used to compare against Palm Beach.
subset_nopalm %>%
  summary()

# Summary of the Y variable for the data excluding Palm Beach, used to
# compare against Palm Beach.
data_nopalm$pergore %>%
  summary()