knitr::opts_chunk$set(class.source = "foldable")
library(dplyr)
# Load the voting table; the file carries no header row that is used here,
# so column names are assigned explicitly below.
data <- read.table("C:/Users/aisli/OneDrive/Documents/classes/STAT525/datasets from textbook/Voting.txt")
# Short names for the 21 columns, in file order: county id, ten candidate
# predictors (lat ... inco), then the per-candidate vote counts.
names(data) <- c(
  "co",
  "lat", "long", "npop", "whit", "blac", "hisp", "o65", "hsed", "coll", "inco",
  "bush", "gore", "brow", "nade", "harr", "hage", "buch", "mcre", "phil", "moor"
)
# Everything except county 50 (Palm Beach, a known possible outlier), with
# pergore = gore / (bush + gore): Gore's share of the Bush + Gore vote.
data_nopalm <- data %>%
  filter(co != 50) %>%
  mutate(pergore = gore / (bush + gore))
# Palm Beach on its own, with the same derived response column.
data_palm <- data %>%
  filter(co == 50) %>%
  mutate(pergore = gore / (bush + gore))
# Distribution of the response (Gore's share of the Bush + Gore vote) over
# the data with Palm Beach excluded.
with(
  data_nopalm,
  hist(
    pergore,
    main = "Histogram of Percentage of Gore Votes",
    xlab = "Percentage",
    ylab = "Frequency",
    col = "lightblue",
    border = "black"
  )
)

knitr::opts_chunk$set(class.source = "foldable")
# Box plot of the response over the data with Palm Beach excluded.
with(
  data_nopalm,
  boxplot(
    pergore,
    main = "Boxplot of Percentage of Gore Votes",
    ylab = "Percentage",
    col = "lightblue",
    border = "black"
  )
)

# Flag the response values that the boxplot rule marks as outliers
# (boxplot.stats()$out holds the points lying beyond the whiskers).
out <- boxplot.stats(data_nopalm$pergore)$out
# Show each flagged row with its county code and Gore share. A bare row
# position is not a county identifier: data_nopalm has had a county filtered
# out, so positions need not line up with the values in `co`.
data_nopalm[data_nopalm$pergore %in% out, c("co", "pergore")]
knitr::opts_chunk$set(class.source = "foldable")
# Arrange the plots that follow on a 2 x 5 grid (one cell per predictor).
par(mfrow = c(2, 5))
# One histogram per predictor; columns 2 through 11 (lat ... inco) are the
# candidate predictors.
for (predictor in colnames(data_nopalm)[2:11]) {
  hist(
    data_nopalm[[predictor]],
    main = paste("Histogram of", predictor),
    xlab = predictor,
    col = "lightblue",
    border = "black"
  )
}

knitr::opts_chunk$set(class.source = "foldable")
# For every predictor (columns 2 through 11): draw its box plot, then print
# the values flagged by the boxplot rule and the full rows they belong to.
for (predictor in colnames(data_nopalm)[2:11]) {
  values <- data_nopalm[[predictor]]
  boxplot(
    values,
    main = paste("Box Plot of", predictor),
    ylab = predictor,
    col = "lightblue",
    border = "black"
  )
  # Points beyond the whiskers, as computed by boxplot.stats().
  flagged <- boxplot.stats(values)$out
  print(flagged)
  # %in% never returns NA, so the logical mask selects exactly the flagged rows.
  print(data_nopalm[values %in% flagged, ])
}










knitr::opts_chunk$set(class.source = "foldable")
# Scatterplot of the response against each predictor (columns 2 through 11).
# The response (pergore) goes on the vertical axis and the predictor on the
# horizontal axis, so the picture matches the "Gore Votes vs <predictor>"
# title and the usual response-on-y reading of a regression scatterplot.
for (col_index in 2:11) {
  col_name <- colnames(data_nopalm)[col_index]
  plot(
    data_nopalm[[col_name]], data_nopalm$pergore,
    main = paste("Gore Votes vs", col_name),
    xlab = col_name,
    ylab = "Percentage of Gore Votes",
    col = "blue"
  )
}










knitr::opts_chunk$set(class.source = "foldable")
# Back to a single plot per device page.
par(mfrow = c(1, 1))
# Keep only the ten predictor columns (lat ... inco).
subset_nopalm <- data_nopalm[2:11]
# Scatterplot matrix of the predictors; this is the call plot() dispatches to
# for a data frame with more than two columns.
pairs(data.matrix(subset_nopalm))

knitr::opts_chunk$set(class.source = "foldable")
# Pairwise Pearson correlations among the predictors (Pearson is cor()'s
# default method; it is spelled out here for the reader).
cor(subset_nopalm, method = "pearson")
knitr::opts_chunk$set(class.source = "foldable")
# Summary of each predictor with Palm Beach excluded, used as the baseline
# when comparing against Palm Beach.
summary(subset_nopalm)
# Summary of the response (pergore) on the same Palm-Beach-excluded data,
# also used for comparison against Palm Beach.
summary(data_nopalm[["pergore"]])