DataDive4_Parimala

# Load the midwest dataset (CSV with a header row) from the Rdatasets repository
midwest <- read.csv(
  "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ggplot2/midwest.csv",
  header = TRUE,
  sep = ","
)
# Show the names of all columns that were read
names(midwest)

##  [1] "rownames"             "PID"                  "county"              
##  [4] "state"                "area"                 "poptotal"            
##  [7] "popdensity"           "popwhite"             "popblack"            
## [10] "popamerindian"        "popasian"             "popother"            
## [13] "percwhite"            "percblack"            "percamerindan"       
## [16] "percasian"            "percother"            "popadults"           
## [19] "perchsd"              "percollege"           "percprof"            
## [22] "poppovertyknown"      "percpovertyknown"     "percbelowpoverty"    
## [25] "percchildbelowpovert" "percadultpoverty"     "percelderlypoverty"  
## [28] "inmetro"              "category"

#loading libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)

# Restrict the data to the six variables the subsamples are drawn from
selected_columns <- midwest[c(
  "popdensity", "state", "popwhite",
  "popblack", "inmetro", "percollege"
)]

CREATING SAMPLES

# Fix the RNG seed so the number of samples and the rows drawn are the same
# on every run; without it, observations about specific samples cannot be
# reproduced.
set.seed(2023)

# Number of subsamples: one value drawn at random from 5 to 10
num_samples <- sample(5:10, 1)
# Each subsample has half as many rows as the selected data (rounded down)
subsample_size <- floor(nrow(selected_columns) * 0.5)
# Preallocate the list instead of growing it inside the loop
sampled_data <- vector("list", num_samples)

# Generate random samples with replacement
for (i in seq_len(num_samples)) {
  sample_indices <- sample(nrow(selected_columns), size = subsample_size, replace = TRUE)
  sampled_data[[i]] <- selected_columns[sample_indices, ]

  # Also expose each subsample as its own data frame (sample_1, sample_2, ...)
  assign(paste0("sample_", i), sampled_data[[i]])
}

Summary statistics for some of the samples, for comparison:

SAMPLE1

# Per-column summary of the first subsample (min, quartiles, mean and max for
# numeric columns; length/class/mode for the character column)
summary(sample_1)

##    popdensity         state              popwhite         popblack     
##  Min.   :  113.5   Length:218         Min.   :  5032   Min.   :     1  
##  1st Qu.:  712.7   Class :character   1st Qu.: 22308   1st Qu.:    44  
##  Median : 1359.4   Mode  :character   Median : 36509   Median :   256  
##  Mean   : 3123.8                      Mean   : 76114   Mean   :  5973  
##  3rd Qu.: 2420.6                      3rd Qu.: 71160   3rd Qu.:  1424  
##  Max.   :39083.3                      Max.   :714905   Max.   :181145  
##     inmetro         percollege    
##  Min.   :0.0000   Min.   : 9.472  
##  1st Qu.:0.0000   1st Qu.:13.863  
##  Median :0.0000   Median :16.521  
##  Mean   :0.3945   Mean   :18.645  
##  3rd Qu.:1.0000   3rd Qu.:21.575  
##  Max.   :1.0000   Max.   :48.079

SAMPLE3

# Per-column summary of the third subsample (min, quartiles, mean and max for
# numeric columns; length/class/mode for the character column)
summary(sample_3)

##    popdensity         state              popwhite          popblack        
##  Min.   :  146.3   Length:218         Min.   :   5634   Min.   :      1.0  
##  1st Qu.:  727.1   Class :character   1st Qu.:  21704   1st Qu.:     32.0  
##  Median : 1266.3   Mode  :character   Median :  35901   Median :    286.5  
##  Mean   : 3751.1                      Mean   :  99514   Mean   :  17971.9  
##  3rd Qu.: 2361.0                      3rd Qu.:  72645   3rd Qu.:   1696.0  
##  Max.   :88018.4                      Max.   :3204947   Max.   :1317147.0  
##     inmetro         percollege    
##  Min.   :0.0000   Min.   : 8.742  
##  1st Qu.:0.0000   1st Qu.:14.585  
##  Median :0.0000   Median :17.409  
##  Mean   :0.3532   Mean   :18.968  
##  3rd Qu.:1.0000   3rd Qu.:21.060  
##  Max.   :1.0000   Max.   :42.769

Bar graphs to compare states and their population density:

library(ggplot2)

# Build one bar graph per subsample: state on the x axis, popdensity on the y
# axis. Bars use the data values directly, so rows sharing a state are stacked.
# seq_along() keeps this safe when sampled_data is empty (1:length() would
# iterate over c(1, 0) and fail), and lapply() avoids growing a list in a loop.
bar_graphs <- lapply(seq_along(sampled_data), function(i) {
  ggplot(sampled_data[[i]], aes(x = state, y = popdensity)) +
    geom_col() +
    labs(title = paste("Sample", i))
})

# Print or visualize the individual bar graphs
for (graph in bar_graphs) {
  print(graph)
}

The plots above show OH with high popdensity in almost all samples. Samples 2, 4, and 9 are worth exploring further to see why IL has high popdensity there.

Statistical tests (t-test): the results below show how each of the other sub-samples compares with sample 1 in terms of the mean of the variable “popdensity.”

# Welch two-sample t-tests comparing popdensity in the first subsample with
# each of the remaining subsamples. seq_len(...)[-1] yields 2..num_samples and
# is empty when there is only one subsample, whereas 2:num_samples would count
# down (2, 1) and index a subsample that does not exist.
t_test_results <- lapply(seq_len(num_samples)[-1], function(i) {
  t.test(sampled_data[[1]]$popdensity, sampled_data[[i]]$popdensity)
})
t_test_results

## [[1]]
## 
##  Welch Two Sample t-test
## 
## data:  sampled_data[[1]]$popdensity and sampled_data[[i]]$popdensity
## t = 0.063736, df = 369.79, p-value = 0.9492
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1313.827  1401.849
## sample estimates:
## mean of x mean of y 
##  3123.762  3079.751 
## 
## 
## [[2]]
## 
##  Welch Two Sample t-test
## 
## data:  sampled_data[[1]]$popdensity and sampled_data[[i]]$popdensity
## t = -0.79207, df = 331.35, p-value = 0.4289
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2185.2283   930.6269
## sample estimates:
## mean of x mean of y 
##  3123.762  3751.062 
## 
## 
## [[3]]
## 
##  Welch Two Sample t-test
## 
## data:  sampled_data[[1]]$popdensity and sampled_data[[i]]$popdensity
## t = -0.33216, df = 363.1, p-value = 0.74
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1622.660  1153.707
## sample estimates:
## mean of x mean of y 
##  3123.762  3358.238 
## 
## 
## [[4]]
## 
##  Welch Two Sample t-test
## 
## data:  sampled_data[[1]]$popdensity and sampled_data[[i]]$popdensity
## t = 0.63196, df = 428.64, p-value = 0.5278
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -666.9818 1299.1309
## sample estimates:
## mean of x mean of y 
##  3123.762  2807.687 
## 
## 
## [[5]]
## 
##  Welch Two Sample t-test
## 
## data:  sampled_data[[1]]$popdensity and sampled_data[[i]]$popdensity
## t = 0.1864, df = 397.48, p-value = 0.8522
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1124.203  1359.710
## sample estimates:
## mean of x mean of y 
##  3123.762  3006.008

DataDive4_Parimala

parimala

2023-09-18