library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
htd <- read_csv("C:\\Users\\moore\\OneDrive\\Desktop\\Fall 2023\\Intro to statistics\\project\\Statistics Project\\Statistics Project\\Human Trafficking data.csv")
## Rows: 3098 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): ORI, PUB_AGENCY_NAME, PUB_AGENCY_UNIT, AGENCY_TYPE_NAME, STATE_ABB...
## dbl  (5): DATA_YEAR, ACTUAL_COUNT, UNFOUNDED_COUNT, CLEARED_COUNT, JUVENILE_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

avg_count_st <- htd |>
  group_by(STATE_ABBR) |>
  summarise(mean_count = mean(ACTUAL_COUNT))
avg_count_st
## # A tibble: 50 × 2
##    STATE_ABBR mean_count
##    <chr>           <dbl>
##  1 AK              112. 
##  2 AL               99.9
##  3 AR               81.8
##  4 AZ              421. 
##  5 CO              191. 
##  6 CT               82.2
##  7 DE              151. 
##  8 FL              244. 
##  9 FS               16  
## 10 GA              192. 
## # … with 40 more rows
min(avg_count_st$mean_count)
## [1] 16
median(avg_count_st$mean_count)
## [1] 127.4348
max(avg_count_st$mean_count)
## [1] 2302.515
mean(avg_count_st$mean_count)
## [1] 209.9257
prob <- function(n){
  probs <- sample(avg_count_st$mean_count,n,replace = FALSE)
  probs
}


#probability a state has a higher average actual count greater then or equal to 120
st_higher_mean <- sum(prob(50) >= mean(avg_count_st$mean_count))
cat("probability a state has a higher average actual count greater then or equal to the mean:", st_higher_mean/50.)
## probability a state has a higher average actual count greater then or equal to the mean: 0.24
st_lower <- sum(prob(50) <= 100)
cat("probability a state has a lower average actual count less then or equal to 100:", st_lower/50)
## probability a state has a lower average actual count less then or equal to 100: 0.34
st_higher_median <- sum(prob(50) >= median(avg_count_st$mean_count))
cat("probability a state has a higher average actual count greater then or equal to the median:", st_higher_median/50)
## probability a state has a higher average actual count greater then or equal to the median: 0.5
count_st <- htd |>
  group_by(STATE_ABBR) |>
  summarize(ACTUAL_COUNT)
## `summarise()` has grouped output by 'STATE_ABBR'. You can override using the
## `.groups` argument.
higher_mean <- function(n){
  probs <- sample(count_st$ACTUAL_COUNT,n,replace = TRUE)
  probs
}
length(count_st$ACTUAL_COUNT)
## [1] 3098
prob_higher_mean <- sum(higher_mean(3098) >= mean(count_st$ACTUAL_COUNT))/3098
prob_higher_median <- sum(higher_mean(3098) >= median(count_st$ACTUAL_COUNT))/3098
cat("probability a state has a higher actual count greater then or equal to the mean:", prob_higher_mean)
## probability a state has a higher actual count greater then or equal to the mean: 0.1823757
cat("probability a state has a higher actual count greater then or equal to the median:", prob_higher_median)
## probability a state has a higher actual count greater then or equal to the median: 0.7872821
rolls <- seq(1, 100, length.out = 100)
prob <- function(n){
  probs <- sample(count_st$ACTUAL_COUNT,n,replace = FALSE)
  probs
}


proportions <- sapply(rolls, function(n) {
  counts <- prob(n)
  sum(counts >= mean(count_st$ACTUAL_COUNT))
})


result_data <- data.frame(Number_of_Trials = rolls, Proportion_Higher = proportions)


result_data |>
  ggplot() +
  geom_line(mapping = aes(x = Number_of_Trials, y = Proportion_Higher,
                          colour = "Stay")) +
  scale_color_brewer(palette = "Dark2") +
  theme_classic() +
  labs(
    title = "Proportions higher than the mean",
    x = "Number of Simulations",
    y = "Count of Proportions Higher than Mean",
    colour = "Strategy"
  )

The visualization illustrates that as the number of simulations (Number_of_Trials) increases the proportion of states with counts higher than the mean count stabilizes around a certain probability. This suggests that based on the data frame there is a consistent estimated probability that a randomly selected state will have a count higher than the mean count. The stable value represents this estimated probability providing valuable insights into the likelihood of states exceeding the mean count.

counts_region <- htd |> 
  group_by(REGION_NAME, OFFENSE_SUBCAT_NAME) |>
  summarize(ACTUAL_COUNT)
## `summarise()` has grouped output by 'REGION_NAME', 'OFFENSE_SUBCAT_NAME'. You
## can override using the `.groups` argument.
prob_region <- function(n){
  probs <- sample(counts_region$ACTUAL_COUNT,n,replace = FALSE)
  probs
}

prob_region_offense <- function(n){
  probs <- sample(counts_region$OFFENSE_SUBCAT_NAME,n,replace = FALSE)
  probs
}
mean_commercial_sex_acts <- mean(counts_region$ACTUAL_COUNT[counts_region$OFFENSE_SUBCAT_NAME == "Commercial Sex Acts"])
mean_involuntary_servitude <- mean(counts_region$ACTUAL_COUNT[counts_region$OFFENSE_SUBCAT_NAME  == "Involuntary Servitude"])
ggplot(counts_region, aes(x = REGION_NAME, y = ACTUAL_COUNT, fill = OFFENSE_SUBCAT_NAME)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average Actual Counts by Region and Offense Subcategory",
       x = "Region", y = "Average Actual Counts") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Set3")  

From this graph I can conclude commercial sex acts are more frequent in the west region of United States in this dataset. As well the graph shows involuntary servitude is more frequent in the south region of the united states. From this I can find the probaility that commercial sex acts are more frequent than involuntary servitude in all the regions.

n_simulations <- 10  
probability_commercial <- sum(prob_region_offense(n_simulations) == "Commercial Sex Acts") / n_simulations
probability_involuntary <- sum(prob_region_offense(n_simulations) == "Involuntary Servitude") / n_simulations

prob_data_region <- data.frame(Event = c("Offense Subcategory == 'Commerical Sex Acts",
                             "Offense Subcategory == 'Involuntary Servitude'"),
                   Probability = c(probability_commercial, probability_involuntary))

ggplot(prob_data_region, aes(x = Event, y = Probability, fill = Event)) +
  geom_bar(stat = "identity") +
  labs(title = "Probability of commercial sex acts appearing more",
       x = "Event", y = "Probability") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

This visualization shows the probability of commercial sex acts appearing more in each region from the counts_region data frame. Based on this I can conclude not only are there more commercial sex acts but for each region it has a higher count than involutary servitude. As well with each trial it consistently has a higher probability.

combinations <- htd |>
  group_by(POPULATION_GROUP_DESC, OFFENSE_SUBCAT_NAME) |>
  count()

ggplot(data = combinations) +
  geom_bar(aes(x = POPULATION_GROUP_DESC, y = n, fill = factor(OFFENSE_SUBCAT_NAME, labels = c("Commercial Sex Acts", "Involuntary Servitude"))), stat = "identity", position = "dodge") +
  labs(
    x = "Population Group Description",
    y = "Count",
    title = "Comparison of Most and Least Common Combinations by Offense Subcategory"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(
    values = c("blue", "red"),
    name = "Offense Subcategory"
  ) +
  guides(fill = guide_legend(title = "Offense Subcategory"))

This graph visualizes the combination of population group descriptions with the two offenses (commercial sex acts and involuntary servitude) and which ones appear the least and most.Earlier since I already knew that there is more cases of commercial sex acts than involuntary servitude I wanted to find which population group description has the highest amount of commercial sex acts. As well with involuntary servitude I can see which combinations are more frequent. My hypothesis is cities with larger population will experience more offenses. This graph displays the accuracy of the hypothesis and turns out to be incorrect. The two larger demographics, “cities from 250000 thru 499999” and “cities from 500000 thru 999999” do not have the most combinations of the two offenses. From this I can do further research as to what states these cities align with.