library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Read the raw human-trafficking records from CSV into the data frame `htd`.
# NOTE(review): the path is absolute and machine-specific, so this line only
# works where the file exists at exactly this location.
htd <- read.csv(
  file = "C:\\Users\\moore\\OneDrive\\Desktop\\Fall 2023\\Intro to statistics\\project\\Statistics Project\\Statistics Project\\Human Trafficking data.csv"
)
# Show the first rows to confirm the columns were parsed as expected.
head(x = htd)
## DATA_YEAR ORI PUB_AGENCY_NAME PUB_AGENCY_UNIT
## 1 2013 MO0950000 St. Louis County Police Department
## 2 2013 OH0020000 Allen
## 3 2013 OH0020000 Allen
## 4 2013 TN0320100 Morristown
## 5 2013 TNMPD0000 Memphis
## 6 2013 WA0173600 Federal Way
## AGENCY_TYPE_NAME STATE_ABBR STATE_NAME DIVISION_NAME COUNTY_NAME
## 1 County MO Missouri West North Central ST LOUIS
## 2 County OH Ohio East North Central ALLEN
## 3 County OH Ohio East North Central ALLEN
## 4 City TN Tennessee East South Central HAMBLEN, JEFFERSON
## 5 City TN Tennessee East South Central SHELBY
## 6 City WA Washington Pacific KING
## REGION_NAME POPULATION_GROUP_CODE POPULATION_GROUP_DESC
## 1 Midwest 9A MSA counties 100,000 or over
## 2 Midwest 9C MSA counties from 10,000 thru 24,999
## 3 Midwest 9C MSA counties from 10,000 thru 24,999
## 4 South 4 Cities from 25,000 thru 49,999
## 5 South 1B Cities from 500,000 thru 999,999
## 6 West 3 Cities from 50,000 thru 99,999
## OFFENSE_SUBCAT_ID OFFENSE_NAME OFFENSE_SUBCAT_NAME ACTUAL_COUNT
## 1 HTCSA Human Trafficking Commercial Sex Acts 384
## 2 HTCSA Human Trafficking Commercial Sex Acts 64
## 3 HTIS Human Trafficking Involuntary Servitude 64
## 4 HTCSA Human Trafficking Commercial Sex Acts 64
## 5 HTCSA Human Trafficking Commercial Sex Acts 192
## 6 HTCSA Human Trafficking Commercial Sex Acts 64
## UNFOUNDED_COUNT CLEARED_COUNT JUVENILE_CLEARED_COUNT
## 1 0 384 0
## 2 0 64 0
## 3 0 0 0
## 4 0 64 0
## 5 0 128 0
## 6 0 64 0
# Keep only the columns that the plots and discussion below rely on.
samples <- htd |>
  select(
    DATA_YEAR, AGENCY_TYPE_NAME, STATE_ABBR, REGION_NAME,
    POPULATION_GROUP_DESC, OFFENSE_SUBCAT_NAME, ACTUAL_COUNT, CLEARED_COUNT
  )

# Each sample holds half as many rows as the full data set.
total_rows <- nrow(htd)
samples_size <- round(0.5 * total_rows)

# Fix the RNG seed so the samples (and every plot built from them) can be
# regenerated exactly when the script is re-run.
set.seed(2023)

# Draw the samples with replacement. The list is preallocated instead of grown.
n_samples <- 10L
sampled_data_frames <- vector("list", n_samples)
for (i in seq_len(n_samples)) {
  # slice_sample() is the current dplyr replacement for the superseded
  # sample_n().
  random_sample <- samples |>
    slice_sample(n = samples_size, replace = TRUE)
  sampled_data_frames[[i]] <- random_sample
  # Also expose each sample as df_1 ... df_10, which later code refers to.
  assign(paste0("df_", i), random_sample)
}

# Preview the first rows of every sample.
for (i in seq_along(sampled_data_frames)) {
  cat("df_", i, ":\n")
  print(head(sampled_data_frames[[i]]))
}
## df_ 1 :
## DATA_YEAR AGENCY_TYPE_NAME STATE_ABBR REGION_NAME
## 1 2015 City TX South
## 2 2018 City MN Midwest
## 3 2021 City NC South
## 4 2015 City SC South
## 5 2019 City WI Midwest
## 6 2017 County TN South
## POPULATION_GROUP_DESC OFFENSE_SUBCAT_NAME ACTUAL_COUNT
## 1 Cities from 25,000 thru 49,999 Involuntary Servitude 64
## 2 Cities from 50,000 thru 99,999 Commercial Sex Acts 256
## 3 Cities from 500,000 thru 999,999 Involuntary Servitude 768
## 4 Cities from 2,500 thru 9,999 Involuntary Servitude 51
## 5 Cities from 2,500 thru 9,999 Commercial Sex Acts 32
## 6 Non-MSA counties from 10,000 thru 24,999 Commercial Sex Acts 64
## CLEARED_COUNT
## 1 128
## 2 64
## 3 512
## 4 0
## 5 32
## 6 0
## df_ 2 :
## DATA_YEAR AGENCY_TYPE_NAME STATE_ABBR REGION_NAME
## 1 2021 County KY South
## 2 2014 City TX South
## 3 2021 State Police WV South
## 4 2021 City MN Midwest
## 5 2020 County UT West
## 6 2014 City IL Midwest
## POPULATION_GROUP_DESC OFFENSE_SUBCAT_NAME ACTUAL_COUNT
## 1 Non-MSA counties from 10,000 thru 24,999 Commercial Sex Acts 128
## 2 Cities from 10,000 thru 24,999 Involuntary Servitude 64
## 3 MSA counties under 10,000 Commercial Sex Acts 25
## 4 Cities from 100,000 thru 249,999 Commercial Sex Acts 448
## 5 MSA counties from 10,000 thru 24,999 Involuntary Servitude 128
## 6 Cities from 10,000 thru 24,999 Involuntary Servitude 53
## CLEARED_COUNT
## 1 0
## 2 64
## 3 0
## 4 0
## 5 0
## 6 0
## df_ 3 :
## DATA_YEAR AGENCY_TYPE_NAME STATE_ABBR REGION_NAME
## 1 2015 State Police MI Midwest
## 2 2019 City NC South
## 3 2015 City TX South
## 4 2020 City HI West
## 5 2018 County FL South
## 6 2021 State Police VA South
## POPULATION_GROUP_DESC OFFENSE_SUBCAT_NAME ACTUAL_COUNT
## 1 MSA counties under 10,000 Commercial Sex Acts 135
## 2 Cities from 10,000 thru 24,999 Commercial Sex Acts 256
## 3 Cities from 10,000 thru 24,999 Commercial Sex Acts 48
## 4 Cities from 500,000 thru 999,999 Commercial Sex Acts 704
## 5 MSA counties 100,000 or over Commercial Sex Acts 64
## 6 MSA counties under 10,000 Commercial Sex Acts 52
## CLEARED_COUNT
## 1 135
## 2 192
## 3 0
## 4 320
## 5 0
## 6 0
## df_ 4 :
## DATA_YEAR AGENCY_TYPE_NAME STATE_ABBR REGION_NAME
## 1 2021 City TX South
## 2 2020 City MI Midwest
## 3 2020 County TN South
## 4 2020 County MO Midwest
## 5 2017 City IL Midwest
## 6 2019 City CO West
## POPULATION_GROUP_DESC OFFENSE_SUBCAT_NAME ACTUAL_COUNT
## 1 Cities from 100,000 thru 249,999 Commercial Sex Acts 512
## 2 Cities from 50,000 thru 99,999 Commercial Sex Acts 64
## 3 MSA counties from 25,000 thru 99,999 Commercial Sex Acts 384
## 4 Non-MSA counties under 10,000 Commercial Sex Acts 64
## 5 Cities from 25,000 thru 49,999 Commercial Sex Acts 64
## 6 Cities from 100,000 thru 249,999 Commercial Sex Acts 66
## CLEARED_COUNT
## 1 192
## 2 0
## 3 192
## 4 0
## 5 0
## 6 66
## df_ 5 :
## DATA_YEAR AGENCY_TYPE_NAME STATE_ABBR REGION_NAME
## 1 2019 City GA South
## 2 2020 State Police VA South
## 3 2020 County GA South
## 4 2018 City MA Northeast
## 5 2014 County FL South
## 6 2017 County CO West
## POPULATION_GROUP_DESC OFFENSE_SUBCAT_NAME ACTUAL_COUNT
## 1 Cities from 10,000 thru 24,999 Commercial Sex Acts 64
## 2 Non-MSA counties under 10,000 Involuntary Servitude 52
## 3 MSA counties from 25,000 thru 99,999 Commercial Sex Acts 64
## 4 Cities from 25,000 thru 49,999 Commercial Sex Acts 64
## 5 MSA counties 100,000 or over Commercial Sex Acts 320
## 6 MSA counties from 25,000 thru 99,999 Involuntary Servitude 64
## CLEARED_COUNT
## 1 0
## 2 0
## 3 0
## 4 64
## 5 320
## 6 0
## df_ 6 :
## DATA_YEAR AGENCY_TYPE_NAME STATE_ABBR REGION_NAME
## 1 2018 City MA Northeast
## 2 2021 County TX South
## 3 2019 City AL South
## 4 2014 City TX South
## 5 2021 City KY South
## 6 2020 City GA South
## POPULATION_GROUP_DESC OFFENSE_SUBCAT_NAME ACTUAL_COUNT
## 1 Cities from 2,500 thru 9,999 Commercial Sex Acts 62
## 2 MSA counties from 25,000 thru 99,999 Commercial Sex Acts 64
## 3 Cities from 2,500 thru 9,999 Commercial Sex Acts 48
## 4 Cities from 250,000 thru 499,999 Involuntary Servitude 128
## 5 Cities from 2,500 thru 9,999 Involuntary Servitude 34
## 6 Cities from 50,000 thru 99,999 Commercial Sex Acts 1536
## CLEARED_COUNT
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 768
## df_ 7 :
## DATA_YEAR AGENCY_TYPE_NAME STATE_ABBR REGION_NAME
## 1 2017 City TX South
## 2 2017 City KY South
## 3 2021 City LA South
## 4 2021 City ME Northeast
## 5 2016 City MN Midwest
## 6 2019 County GA South
## POPULATION_GROUP_DESC OFFENSE_SUBCAT_NAME ACTUAL_COUNT
## 1 Cities from 25,000 thru 49,999 Involuntary Servitude 64
## 2 Cities from 250,000 thru 499,999 Commercial Sex Acts 128
## 3 Cities from 10,000 thru 24,999 Commercial Sex Acts 64
## 4 Cities from 25,000 thru 49,999 Commercial Sex Acts 64
## 5 Cities from 10,000 thru 24,999 Commercial Sex Acts 64
## 6 MSA counties from 25,000 thru 99,999 Commercial Sex Acts 704
## CLEARED_COUNT
## 1 0
## 2 64
## 3 64
## 4 0
## 5 64
## 6 128
## df_ 8 :
## DATA_YEAR AGENCY_TYPE_NAME STATE_ABBR REGION_NAME
## 1 2021 State Police WV South
## 2 2019 City TX South
## 3 2019 State Police WV South
## 4 2019 City KY South
## 5 2020 County OR West
## 6 2018 County ND Midwest
## POPULATION_GROUP_DESC OFFENSE_SUBCAT_NAME ACTUAL_COUNT
## 1 MSA counties under 10,000 Commercial Sex Acts 25
## 2 Cities from 100,000 thru 249,999 Commercial Sex Acts 1536
## 3 Non-MSA counties under 10,000 Commercial Sex Acts 25
## 4 Cities from 2,500 thru 9,999 Commercial Sex Acts 64
## 5 MSA counties 100,000 or over Involuntary Servitude 64
## 6 Non-MSA counties under 10,000 Commercial Sex Acts 64
## CLEARED_COUNT
## 1 25
## 2 320
## 3 0
## 4 64
## 5 0
## 6 0
## df_ 9 :
## DATA_YEAR AGENCY_TYPE_NAME STATE_ABBR REGION_NAME
## 1 2016 County CO West
## 2 2019 County CO West
## 3 2019 County IN Midwest
## 4 2020 County MO Midwest
## 5 2019 City NC South
## 6 2019 City MA Northeast
## POPULATION_GROUP_DESC OFFENSE_SUBCAT_NAME ACTUAL_COUNT
## 1 MSA counties from 25,000 thru 99,999 Involuntary Servitude 128
## 2 MSA counties 100,000 or over Commercial Sex Acts 128
## 3 MSA counties from 25,000 thru 99,999 Commercial Sex Acts 64
## 4 Non-MSA counties from 25,000 thru 99,999 Commercial Sex Acts 64
## 5 Cities from 50,000 thru 99,999 Commercial Sex Acts 128
## 6 Cities from 25,000 thru 49,999 Commercial Sex Acts 64
## CLEARED_COUNT
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## df_ 10 :
## DATA_YEAR AGENCY_TYPE_NAME STATE_ABBR REGION_NAME
## 1 2020 City MA Northeast
## 2 2018 City CT Northeast
## 3 2016 City MI Midwest
## 4 2019 City NC South
## 5 2016 City MI Midwest
## 6 2021 City MA Northeast
## POPULATION_GROUP_DESC OFFENSE_SUBCAT_NAME ACTUAL_COUNT
## 1 Cities from 50,000 thru 99,999 Involuntary Servitude 64
## 2 Cities from 25,000 thru 49,999 Commercial Sex Acts 128
## 3 Cities from 50,000 thru 99,999 Involuntary Servitude 128
## 4 Cities from 50,000 thru 99,999 Involuntary Servitude 128
## 5 Cities from 50,000 thru 99,999 Commercial Sex Acts 576
## 6 Cities from 10,000 thru 24,999 Commercial Sex Acts 62
## CLEARED_COUNT
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Gather the ten sampled data frames into one list so they can be plotted in
# a loop.
data_frames <- list(
  df_1, df_2, df_3, df_4, df_5,
  df_6, df_7, df_8, df_9, df_10
)
# Titles are derived from the list length so they cannot drift out of sync
# with the list ("df_1" ... "df_10").
plot_titles <- paste0("df_", seq_along(data_frames))
# One fill colour per data frame, matched by position.
colors <- c(
  "red", "blue", "green", "purple", "orange",
  "pink", "brown", "cyan", "magenta", "gray"
)

# Histogram of ACTUAL_COUNT for each sampled data frame.
# seq_along() is used instead of 1:length() so an empty list yields no
# iterations rather than indices 1 and 0.
for (i in seq_along(data_frames)) {
  p <- ggplot(data_frames[[i]], aes(x = ACTUAL_COUNT)) +
    geom_histogram(fill = colors[i], bins = 20) +
    labs(title = plot_titles[i], x = "Values", y = "Count") +
    theme_minimal()
  print(p)
}
The histograms show that, in every data frame, the ACTUAL_COUNT column is
heavily right-skewed: most values sit at the low end, with a long tail toward large counts.
# Bar chart of the number of rows per REGION_NAME in each sampled data frame.
# REGION_NAME holds text categories, so geom_bar() (which counts rows per
# category) is the right geom. geom_histogram(stat = "count", bins = 20) drew
# the same bars but warned that `bins` was ignored.
for (i in seq_along(data_frames)) {
  p <- ggplot(data_frames[[i]], aes(x = REGION_NAME)) +
    geom_bar(fill = colors[i]) +
    labs(title = plot_titles[i], x = "Values", y = "Count") +
    theme_minimal()
  print(p)
}
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
# Base-graphics scatter plot of ACTUAL_COUNT against row position for each
# sampled data frame.
for (i in seq_along(data_frames)) {
  counts <- data_frames[[i]]$ACTUAL_COUNT
  index <- seq_along(counts)
  # plot() draws as a side effect and returns NULL, so its result is not stored.
  plot(
    index, counts,
    main = plot_titles[i], xlab = "Index", ylab = "ACTUAL_COUNT",
    col = colors[i], pch = 19
  )
  # Add a legend to identify the data frame being shown.
  legend(
    "topright",
    legend = plot_titles[i], col = colors[i], pch = 19, cex = 0.8
  )
}
# Histogram of DATA_YEAR for each sampled data frame.
# DATA_YEAR takes whole-year values, so a bin width of 1 gives exactly one bar
# per year. The previous bins = 20 split the few distinct years into narrower
# bins and left empty gaps between the bars.
for (i in seq_along(data_frames)) {
  p <- ggplot(data_frames[[i]], aes(x = DATA_YEAR)) +
    geom_histogram(fill = colors[i], binwidth = 1) +
    labs(title = plot_titles[i], x = "Values", y = "Count") +
    theme_minimal()
  print(p)
}
# Bar chart of the number of rows per STATE_ABBR in each sampled data frame.
# STATE_ABBR holds text categories, so geom_bar() is used to count rows per
# state. geom_histogram(stat = "count", bins = 20) drew the same bars but
# warned that `bins` was ignored.
for (i in seq_along(data_frames)) {
  p <- ggplot(data_frames[[i]], aes(x = STATE_ABBR)) +
    geom_bar(fill = colors[i]) +
    labs(title = plot_titles[i], x = "Values", y = "Count") +
    theme_minimal()
  print(p)
}
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
# Bar chart of the number of rows per AGENCY_TYPE_NAME in each sampled data
# frame. The column holds text categories, so geom_bar() is used to count rows
# per agency type. geom_histogram(stat = "count", bins = 20) drew the same
# bars but warned that `bins` was ignored.
for (i in seq_along(data_frames)) {
  p <- ggplot(data_frames[[i]], aes(x = AGENCY_TYPE_NAME)) +
    geom_bar(fill = colors[i]) +
    labs(title = plot_titles[i], x = "Values", y = "Count") +
    theme_minimal()
  print(p)
}
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Ignoring unknown parameters: binwidth, bins, pad
Based on a visual analysis of the data frames generated by the sampling process, there appear to be no notable differences or anomalies among them. The distributions of the selected columns, which were sampled in the same way for every data frame, are highly consistent and similar. This suggests that the random sampling method did not introduce significant variations or anomalies into the data frames, and that they closely resemble one another in terms of their data distributions. Considering this, it is important to review and refine the sampling procedure to ensure that future investigations can capture the full range of variability in the data. This might involve adjusting the sampling strategy or exploring alternative methods to better represent the underlying population.
A scatterplot of the ACTUAL_COUNT column against the row index shows more variability as the index changes, because each data frame is a different random sample. As the index changes, the number of visible outliers differs from one data frame to the next, but every sample still displays the same common pattern of low variability. This suggests that increasing the sample size may draw out more outliers and variability in future analyses of the data.