# Load the midwest county dataset from the Rdatasets GitHub mirror.
# read.csv() already defaults to a header row and a comma separator.
midwest <- read.csv(
  "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ggplot2/midwest.csv"
)
# colnames(midwest)

Loading libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)

Creating samples

# Restrict the dataset to the columns used in this analysis.
selected_columns <- midwest[, c(
  "popdensity", "state", "popwhite", "popblack", "inmetro", "percollege"
)]

# Draw between 5 and 10 sub-samples, each half the size of the dataset
# (rounded down).
num_samples <- sample(5:10, 1)
subsample_size <- nrow(selected_columns) %/% 2
sampled_data <- vector("list", num_samples)

# Build each sub-sample by drawing row indices with replacement.
for (i in seq_len(num_samples)) {
  sample_indices <- sample.int(
    nrow(selected_columns),
    subsample_size,
    replace = TRUE
  )
  sampled_data[[i]] <- selected_columns[sample_indices, ]

  # Also expose the sub-sample as its own variable: sample_1, sample_2, ...
  assign(sprintf("sample_%d", i), sampled_data[[i]])
}

Bar graphs to compare states and their population density:

# One bar graph of popdensity by state per sub-sample.
# The list is preallocated and indexed with seq_along() so that an empty
# `sampled_data` produces no plots instead of failing on the 1:0 sequence.
bar_graphs <- vector("list", length(sampled_data))

# Create and store a bar graph for each sample.
for (i in seq_along(sampled_data)) {
  # geom_col() draws bars whose heights are the y values themselves; it is
  # the direct equivalent of geom_bar(stat = "identity").
  # NOTE(review): with the default stacking, each state's bar is the stacked
  # popdensity of every sampled row for that state (rows drawn more than once
  # are counted more than once) -- confirm this is the intended comparison.
  bar_graph <- ggplot(sampled_data[[i]], aes(x = state, y = popdensity)) +
    geom_col() +
    labs(title = paste("Sample", i))

  # Store the bar graph in the list.
  bar_graphs[[i]] <- bar_graph
}

# Render the individual bar graphs.
for (i in seq_along(bar_graphs)) {
  print(bar_graphs[[i]])
}

Distribution of population by race in different states

# One grouped bar chart of white vs. black population by state per sub-sample.
# The list is preallocated and indexed with seq_along() so that an empty
# `sampled_data` produces no plots instead of failing on the 1:0 sequence.
pop_race <- vector("list", length(sampled_data))

for (i in seq_along(sampled_data)) {
  # Reshape to long form: one row per (sampled row, race column), with the
  # column name in `population` and its value in `count`.
  # pivot_longer() replaces the superseded gather().
  long_data <- pivot_longer(
    sampled_data[[i]],
    cols = c(popwhite, popblack),
    names_to = "population",
    values_to = "count"
  )

  # Create a grouped bar chart with the two race columns side by side.
  # NOTE(review): a state contributes many rows per race, and dodged bars
  # that share the same state and fill appear to be drawn on top of each
  # other rather than summed -- confirm whether totals per state were intended.
  popracegraph <- ggplot(long_data, aes(x = state, y = count, fill = population)) +
    geom_col(position = "dodge") +
    labs(title = "Population by Race and State", y = "Population") +
    scale_fill_manual(values = c("popwhite" = "blue", "popblack" = "red")) +
    theme_minimal()

  # Store the bar graph in the list.
  pop_race[[i]] <- popracegraph
}

# Render the individual bar graphs.
for (i in seq_along(pop_race)) {
  print(pop_race[[i]])
}

Statistical tests (Welch two-sample t-tests): each test compares the mean of the variable “popdensity” in the first sub-sample with its mean in one of the other sub-samples.

# Two-sample t-tests on popdensity: the first sub-sample against each of the
# others, giving one result per remaining sub-sample.
# Indexing with seq_along(...)[-1] ties the comparisons to the samples that
# actually exist and yields an empty list when there are fewer than two,
# whereas 2:n would count backwards (2, 1) for n = 1.
t_test_results <- lapply(seq_along(sampled_data)[-1], function(i) {
  t.test(sampled_data[[1]]$popdensity, sampled_data[[i]]$popdensity)
})
# t_test_results