# Load the midwest data set from the Rdatasets mirror on GitHub. The file is
# comma-separated and its first row holds the column names.
midwest <- read.csv(
  file = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ggplot2/midwest.csv",
  header = TRUE,
  sep = ","
)
# List the names of all columns that were read.
colnames(midwest)
## [1] "rownames" "PID" "county"
## [4] "state" "area" "poptotal"
## [7] "popdensity" "popwhite" "popblack"
## [10] "popamerindian" "popasian" "popother"
## [13] "percwhite" "percblack" "percamerindan"
## [16] "percasian" "percother" "popadults"
## [19] "perchsd" "percollege" "percprof"
## [22] "poppovertyknown" "percpovertyknown" "percbelowpoverty"
## [25] "percchildbelowpovert" "percadultpoverty" "percelderlypoverty"
## [28] "inmetro" "category"
#loading libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Keep only the six variables that the sub-samples are drawn from.
selected_columns <- midwest[c(
  "popdensity", "state", "popwhite",
  "popblack", "inmetro", "percollege"
)]
CREATING SAMPLES
# Fix the RNG seed so that the number of sub-samples and the rows drawn into
# each one are the same on every run. Without it, sample numbers quoted in the
# write-up cannot be reproduced. The seed value itself is arbitrary.
set.seed(123)

# Number of sub-samples: a single integer drawn at random from 5..10.
num_samples <- sample(5:10, 1)
# Each sub-sample has half as many rows as the source table (rounded down).
subsample_size <- floor(nrow(selected_columns) * 0.5)
# Preallocate one slot per sub-sample instead of growing the list in the loop.
sampled_data <- vector("list", num_samples)

# Generate random samples with replacement
for (i in seq_len(num_samples)) {
  # Row positions are drawn with replacement, so a row can appear more than
  # once within the same sub-sample.
  sample_indices <- sample(
    nrow(selected_columns),
    size = subsample_size,
    replace = TRUE
  )
  sampled_data[[i]] <- selected_columns[sample_indices, ]
  # Also expose each sub-sample as its own data frame (sample_1, sample_2, ...)
  # because later code refers to the samples by these names.
  assign(paste0("sample_", i), sampled_data[[i]])
}
Summary of some of the samples, for comparison. SAMPLE1:
# Per-column summary of the first sub-sample (the data frame stored in
# slot 1 of sampled_data, also available as sample_1).
summary(sampled_data[[1]])
## popdensity state popwhite popblack
## Min. : 113.5 Length:218 Min. : 5032 Min. : 1
## 1st Qu.: 712.7 Class :character 1st Qu.: 22308 1st Qu.: 44
## Median : 1359.4 Mode :character Median : 36509 Median : 256
## Mean : 3123.8 Mean : 76114 Mean : 5973
## 3rd Qu.: 2420.6 3rd Qu.: 71160 3rd Qu.: 1424
## Max. :39083.3 Max. :714905 Max. :181145
## inmetro percollege
## Min. :0.0000 Min. : 9.472
## 1st Qu.:0.0000 1st Qu.:13.863
## Median :0.0000 Median :16.521
## Mean :0.3945 Mean :18.645
## 3rd Qu.:1.0000 3rd Qu.:21.575
## Max. :1.0000 Max. :48.079
SAMPLE3
# Per-column summary of the third sub-sample (the data frame stored in
# slot 3 of sampled_data, also available as sample_3).
summary(sampled_data[[3]])
## popdensity state popwhite popblack
## Min. : 146.3 Length:218 Min. : 5634 Min. : 1.0
## 1st Qu.: 727.1 Class :character 1st Qu.: 21704 1st Qu.: 32.0
## Median : 1266.3 Mode :character Median : 35901 Median : 286.5
## Mean : 3751.1 Mean : 99514 Mean : 17971.9
## 3rd Qu.: 2361.0 3rd Qu.: 72645 3rd Qu.: 1696.0
## Max. :88018.4 Max. :3204947 Max. :1317147.0
## inmetro percollege
## Min. :0.0000 Min. : 8.742
## 1st Qu.:0.0000 1st Qu.:14.585
## Median :0.0000 Median :17.409
## Mean :0.3532 Mean :18.968
## 3rd Qu.:1.0000 3rd Qu.:21.060
## Max. :1.0000 Max. :42.769
Bar graphs to compare states and their population density:
library(ggplot2)
# One bar chart per sub-sample, stored at the same index as the sub-sample.
# Preallocated so the list is not grown inside the loop.
bar_graphs <- vector("list", length(sampled_data))
# Create and store bar graphs for each sample
# seq_along() yields no iterations for an empty list, whereas 1:length(x)
# would iterate over c(1, 0).
for (i in seq_along(sampled_data)) {
  # geom_col() is the direct equivalent of geom_bar(stat = "identity"): the
  # y values are plotted as given and stacked by default.
  # NOTE(review): when a state occurs in several rows of a sub-sample, its bar
  # height is the SUM of those rows' popdensity values, not an average --
  # confirm this is the intended comparison.
  bar_graph <- ggplot(sampled_data[[i]], aes(x = state, y = popdensity)) +
    geom_col() +
    labs(title = paste("Sample", i)) # Customize the title if needed
  # Store the bar graph in the list
  bar_graphs[[i]] <- bar_graph
}
# Print or visualize the individual bar graphs
for (bar_graph in bar_graphs) {
  print(bar_graph)
}
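The bar graphs above show OH with a high popdensity in almost all samples.
Note that each bar is the sum of the popdensity values of all rows sampled for that state, so it also reflects how many rows of the state were drawn. Samples 2, 4 and 9 could be explored further to see why IL has a high popdensity there.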
Statistical tests (Welch two-sample t-test): sample 1 is compared with each of the other sub-samples on the variable “popdensity”; each result indicates whether the two sample means differ significantly.
# Compare popdensity in sample 1 against every other sub-sample with
# t.test(), which runs a Welch two-sample t-test by default.
# seq_len(n)[-1] is empty when there is only one sample, whereas 2:n would
# count down to c(2, 1).
comparison_ids <- seq_len(num_samples)[-1]
t_test_results <- lapply(comparison_ids, function(i) {
  result <- t.test(sampled_data[[1]]$popdensity, sampled_data[[i]]$popdensity)
  # Record which samples were compared; otherwise every printed result shows
  # the unevaluated text "sampled_data[[i]]" and they cannot be told apart.
  result$data.name <- sprintf(
    "sampled_data[[1]]$popdensity and sampled_data[[%d]]$popdensity",
    i
  )
  result
})
# Name each element after the pair it compares, since element k of the list
# holds the comparison with sample k + 1. sprintf() returns character(0) for
# an empty index vector, so this is safe with a single sample.
names(t_test_results) <- sprintf("sample_1_vs_sample_%d", comparison_ids)
t_test_results
## [[1]]
##
## Welch Two Sample t-test
##
## data: sampled_data[[1]]$popdensity and sampled_data[[i]]$popdensity
## t = 0.063736, df = 369.79, p-value = 0.9492
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1313.827 1401.849
## sample estimates:
## mean of x mean of y
## 3123.762 3079.751
##
##
## [[2]]
##
## Welch Two Sample t-test
##
## data: sampled_data[[1]]$popdensity and sampled_data[[i]]$popdensity
## t = -0.79207, df = 331.35, p-value = 0.4289
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2185.2283 930.6269
## sample estimates:
## mean of x mean of y
## 3123.762 3751.062
##
##
## [[3]]
##
## Welch Two Sample t-test
##
## data: sampled_data[[1]]$popdensity and sampled_data[[i]]$popdensity
## t = -0.33216, df = 363.1, p-value = 0.74
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1622.660 1153.707
## sample estimates:
## mean of x mean of y
## 3123.762 3358.238
##
##
## [[4]]
##
## Welch Two Sample t-test
##
## data: sampled_data[[1]]$popdensity and sampled_data[[i]]$popdensity
## t = 0.63196, df = 428.64, p-value = 0.5278
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -666.9818 1299.1309
## sample estimates:
## mean of x mean of y
## 3123.762 2807.687
##
##
## [[5]]
##
## Welch Two Sample t-test
##
## data: sampled_data[[1]]$popdensity and sampled_data[[i]]$popdensity
## t = 0.1864, df = 397.48, p-value = 0.8522
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1124.203 1359.710
## sample estimates:
## mean of x mean of y
## 3123.762 3006.008