Final-Project-1.knit

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(ggthemes)

## Warning: package 'ggthemes' was built under R version 4.2.3

library(ggrepel)

# time series toolkits
library(xts)

## Warning: package 'xts' was built under R version 4.2.3

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last

library(tsibble)

## Warning: package 'tsibble' was built under R version 4.2.3

## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:zoo':
## 
##     index
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union

library(lubridate)

## 
## Attaching package: 'lubridate'
## 
## The following object is masked from 'package:tsibble':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

htd <- read.csv("C:\\Users\\moore\\OneDrive\\Desktop\\Fall 2023\\Intro to statistics\\project\\Statistics Project\\Statistics Project\\htd_transformed.csv")

anova <- aov(CLEARED_COUNT ~ ACTUAL_COUNT, data = htd)
summary(anova)

##                Df    Sum Sq   Mean Sq F value Pr(>F)    
## ACTUAL_COUNT    1 256436013 256436013    7635 <2e-16 ***
## Residuals    3096 103980167     33585                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova2 <-aov(JUVENILE_CLEARED_COUNT ~ ACTUAL_COUNT, data = htd)
summary(anova2)

##                Df   Sum Sq Mean Sq F value Pr(>F)    
## ACTUAL_COUNT    1   688205  688205   187.6 <2e-16 ***
## Residuals    3096 11360394    3669                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

htd |>
  ggplot(aes(x = CLEARED_COUNT, y = ACTUAL_COUNT, color = REGION_NAME)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Scatter Plot and Regression Lines",
       subtitle = "",
       x = "CLEARED_COUNT", y = "ACTUAL_COUNT") +
  theme_minimal()

## `geom_smooth()` using formula 'y ~ x'

htd |>
  ggplot(aes(x = JUVENILE_CLEARED_COUNT, y = ACTUAL_COUNT, color = REGION_NAME)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Scatter Plot and Regression Lines",
       subtitle = "",
       x = "Juvenile clear counts", y = "Actual counts") +
  theme_minimal()

## `geom_smooth()` using formula 'y ~ x'

htd_1 <- htd |>
  select(DATA_YEAR, REGION_NAME, ACTUAL_COUNT, CLEARED_COUNT, JUVENILE_CLEARED_COUNT) |>
  distinct()
htd_1 <- htd_1|>
  group_by(DATA_YEAR, REGION_NAME) |>
  summarise(SUM_ACTUAL_COUNT = sum(ACTUAL_COUNT),
            SUM_JUVENILE_CLEARED_COUNT = sum(JUVENILE_CLEARED_COUNT),
            SUM_CLEARED_COUNT = sum(CLEARED_COUNT))

## `summarise()` has grouped output by 'DATA_YEAR'. You can override using the
## `.groups` argument.

htd_ts1 <- htd_1 |>
  as_tibble(index = DATA_YEAR, key = c("SUM_ACTUAL_COUNT", "SUM_JUVENILE_CLEARED_COUNT", "SUM_CLEARED_COUNT"))
fit_linear_models <- function(region_name) {
  region_data <- htd_ts1 %>% filter(REGION_NAME == region_name)

  # Linear model for actual counts
  actual_counts_model <- lm(SUM_ACTUAL_COUNT ~ DATA_YEAR, data = region_data)
  print(paste("Linear model for actual counts in", region_name))
  print(summary(actual_counts_model))

  # Linear model for cleared counts
  cleared_counts_model <- lm(SUM_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
  print(paste("Linear model for cleared counts in", region_name))
  print(summary(cleared_counts_model))

  # Linear model for juvenile cleared counts
  juvenile_cleared_counts_model <- lm(SUM_JUVENILE_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
  print(paste("Linear model for juvenile cleared counts in", region_name))
  print(summary(juvenile_cleared_counts_model))
}

# Apply the function to each region
fit_linear_models("Midwest")

## [1] "Linear model for actual counts in Midwest"
## 
## Call:
## lm(formula = SUM_ACTUAL_COUNT ~ DATA_YEAR, data = region_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4917.9 -2586.1  -869.9  2616.9  6867.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -3378490.2  1108880.4  -3.047   0.0187 *
## DATA_YEAR       1681.0      549.8   3.058   0.0184 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4258 on 7 degrees of freedom
## Multiple R-squared:  0.5719, Adjusted R-squared:  0.5107 
## F-statistic:  9.35 on 1 and 7 DF,  p-value: 0.01838
## 
## [1] "Linear model for cleared counts in Midwest"
## 
## Call:
## lm(formula = SUM_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4409  -3226   -835   3016   5803 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1570460.9  1056457.9  -1.487    0.181
## DATA_YEAR        782.0      523.8   1.493    0.179
## 
## Residual standard error: 4057 on 7 degrees of freedom
## Multiple R-squared:  0.2415, Adjusted R-squared:  0.1332 
## F-statistic: 2.229 on 1 and 7 DF,  p-value: 0.1791
## 
## [1] "Linear model for juvenile cleared counts in Midwest"
## 
## Call:
## lm(formula = SUM_JUVENILE_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.044 -32.411  -8.578  45.889  72.656 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9532.256  14211.198  -0.671    0.524
## DATA_YEAR       4.767      7.046   0.677    0.520
## 
## Residual standard error: 54.58 on 7 degrees of freedom
## Multiple R-squared:  0.06137,    Adjusted R-squared:  -0.07272 
## F-statistic: 0.4577 on 1 and 7 DF,  p-value: 0.5204

fit_linear_models("West")

## [1] "Linear model for actual counts in West"
## 
## Call:
## lm(formula = SUM_ACTUAL_COUNT ~ DATA_YEAR, data = region_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4736.2 -2430.3  -274.2  3264.7  6140.6 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7288967.4  1075766.1  -6.776 0.000259 ***
## DATA_YEAR       3622.0      533.3   6.791 0.000255 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4131 on 7 degrees of freedom
## Multiple R-squared:  0.8682, Adjusted R-squared:  0.8494 
## F-statistic: 46.12 on 1 and 7 DF,  p-value: 0.0002553
## 
## [1] "Linear model for cleared counts in West"
## 
## Call:
## lm(formula = SUM_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2035.3  -300.4  -160.3   672.6  1788.6 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3938378     322643  -12.21 5.67e-06 ***
## DATA_YEAR       1956        160   12.23 5.60e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1239 on 7 degrees of freedom
## Multiple R-squared:  0.9553, Adjusted R-squared:  0.9489 
## F-statistic: 149.5 on 1 and 7 DF,  p-value: 5.603e-06
## 
## [1] "Linear model for juvenile cleared counts in West"
## 
## Call:
## lm(formula = SUM_JUVENILE_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -395.36 -314.29 -271.62  -96.69 1870.24 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  17572.956 198753.277   0.088    0.932
## DATA_YEAR       -8.533     98.539  -0.087    0.933
## 
## Residual standard error: 763.3 on 7 degrees of freedom
## Multiple R-squared:  0.00107,    Adjusted R-squared:  -0.1416 
## F-statistic: 0.007499 on 1 and 7 DF,  p-value: 0.9334

fit_linear_models("South")

## [1] "Linear model for actual counts in South"
## 
## Call:
## lm(formula = SUM_ACTUAL_COUNT ~ DATA_YEAR, data = region_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10266.8  -6988.0    791.3   2713.6  10520.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.216e+07  1.993e+06  -6.104 0.000489 ***
## DATA_YEAR    6.045e+03  9.879e+02   6.119 0.000482 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7652 on 7 degrees of freedom
## Multiple R-squared:  0.8425, Adjusted R-squared:   0.82 
## F-statistic: 37.45 on 1 and 7 DF,  p-value: 0.0004818
## 
## [1] "Linear model for cleared counts in South"
## 
## Call:
## lm(formula = SUM_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -6936  -3558   -913   2618   9862 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -5439938.3  1385955.8  -3.925  0.00571 **
## DATA_YEAR       2704.3      687.1   3.936  0.00564 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5323 on 7 degrees of freedom
## Multiple R-squared:  0.6887, Adjusted R-squared:  0.6443 
## F-statistic: 15.49 on 1 and 7 DF,  p-value: 0.005635
## 
## [1] "Linear model for juvenile cleared counts in South"
## 
## Call:
## lm(formula = SUM_JUVENILE_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1019.8  -820.4  -604.7  -356.0  4811.9 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -337738.4   509801.8  -0.662    0.529
## DATA_YEAR       168.2      252.8   0.665    0.527
## 
## Residual standard error: 1958 on 7 degrees of freedom
## Multiple R-squared:  0.05947,    Adjusted R-squared:  -0.07489 
## F-statistic: 0.4426 on 1 and 7 DF,  p-value: 0.5272

fit_linear_models("Northeast")

## [1] "Linear model for actual counts in Northeast"
## 
## Call:
## lm(formula = SUM_ACTUAL_COUNT ~ DATA_YEAR, data = region_data)
## 
## Residuals:
##       1       2       3       4       5       6       7 
## -633.14 -173.86  -76.57 1647.71  468.00 -904.71 -327.43 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -814738.1   353527.3  -2.305   0.0694 .
## DATA_YEAR       404.7      175.2   2.310   0.0689 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 927 on 5 degrees of freedom
## Multiple R-squared:  0.5163, Adjusted R-squared:  0.4196 
## F-statistic: 5.337 on 1 and 5 DF,  p-value: 0.06889
## 
## [1] "Linear model for cleared counts in Northeast"
## 
## Call:
## lm(formula = SUM_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
## 
## Residuals:
##      1      2      3      4      5      6      7 
## -313.7 -287.8  469.1  253.0  523.9 -362.2 -282.3 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -193278.21  166290.92  -1.162    0.298
## DATA_YEAR        96.11      82.40   1.166    0.296
## 
## Residual standard error: 436 on 5 degrees of freedom
## Multiple R-squared:  0.2139, Adjusted R-squared:  0.05664 
## F-statistic:  1.36 on 1 and 5 DF,  p-value: 0.2961
## 
## [1] "Linear model for juvenile cleared counts in Northeast"
## 
## Call:
## lm(formula = SUM_JUVENILE_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
## 
## Residuals:
##       1       2       3       4       5       6       7 
##  -4.571  -9.143 -13.714  45.714 -22.857  36.571 -32.000 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9206.857  12376.834  -0.744     0.49
## DATA_YEAR       4.571      6.133   0.745     0.49
## 
## Residual standard error: 32.45 on 5 degrees of freedom
## Multiple R-squared:    0.1,  Adjusted R-squared:  -0.08 
## F-statistic: 0.5556 on 1 and 5 DF,  p-value: 0.4896

#convert year to date 
htd$DATA_YEAR <- as.Date(paste(htd$DATA_YEAR, "-01-01", sep = ""), format = "%Y-%m-%d")

htd_ <- htd |>
  select(DATA_YEAR, ACTUAL_COUNT, JUVENILE_CLEARED_COUNT) |>
  distinct()
htd_ <- htd_|>
  group_by(DATA_YEAR) |>
  summarise(SUM_ACTUAL_COUNT = sum(ACTUAL_COUNT),
            SUM_JUVENILE_CLEARED_COUNT = sum(JUVENILE_CLEARED_COUNT))

htd_ts <- htd_ |>
  as_tsibble(index = DATA_YEAR, key = "SUM_ACTUAL_COUNT")

htd_xts <- xts(x = htd_ts$SUM_ACTUAL_COUNT, order.by = htd_ts$DATA_YEAR)

# Set the column name
htd_xts <- setNames(htd_xts, "actual_counts")

#finding rolling average
htd_ts |>
  filter_index("2013-01" ~ "2021-01") |>
  ggplot(mapping = aes(x = DATA_YEAR, y = SUM_ACTUAL_COUNT)) +
  geom_line() +
  geom_smooth(method = 'lm', color = 'blue', se=FALSE) +
  labs(title = "Actual Counts from January 2013 to January 2021") +
  theme_minimal()

## `geom_smooth()` using formula 'y ~ x'

The presented chart, derived from the tsibble object spanning the years 2013 to 2021, visually encapsulates a discernible upward trend in human trafficking counts. Over this period, the plotted data points exhibit a consistent increase, indicating a rise in the incidence of human trafficking. This trend analysis serves as a valuable insight into the longitudinal dynamics of the issue, potentially prompting further exploration into the contributing factors or necessitating heightened awareness and intervention efforts during the specified timeframe.

# Assuming htd_ is your data frame
htd_ts |>
  filter(DATA_YEAR >= as.Date("2013-01-01") & DATA_YEAR <= as.Date("2021-01-01")) |>
  ggplot(mapping = aes(x = DATA_YEAR, y = SUM_ACTUAL_COUNT)) +
  geom_line() +
  geom_smooth(method = 'lm', color = 'blue', se=FALSE) +
  labs(title = "Actual Counts Over Time",
       subtitle = "Linear Model for 2021-2022") +
  theme_minimal()

## `geom_smooth()` using formula 'y ~ x'

# Assuming htd_ts is your time-series data frame
htd_ts %>%
  filter(!is.na(SUM_ACTUAL_COUNT)) %>%
  index_by(half_year = floor_date(DATA_YEAR, '6 months')) %>%
  summarise(avg_actual_counts = mean(SUM_ACTUAL_COUNT, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = half_year, y = avg_actual_counts)) +
  geom_line() +
  geom_smooth(span = 0.3, color = 'blue', se=FALSE) +
  labs(title = "Average Actual Counts Over Time",
       subtitle = "(by half year)") +
  scale_x_date(breaks = "1 year", labels = \(x) year(x)) +
  theme_minimal()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 15691

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 379.61

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.4486e+05

This chart visually captures the substantial surge in human trafficking counts over the 8-year period, highlighting a noteworthy upward trend in reported incidents. The distinct seasons revealed in the data indicate recurring peaks in human trafficking counts, with each year demonstrating a discernible surge. This cyclical pattern suggests a consistent and periodic escalation in reported cases, potentially influenced by factors such as law enforcement efforts, awareness campaigns, or other external variables. The visualization not only underscores the overall growth in human trafficking counts but also emphasizes the periodicity of these incidents, providing valuable insights for further investigation and analysis.

htd_ts %>%
  filter(!is.na(SUM_JUVENILE_CLEARED_COUNT)) %>%
  index_by(half_year = floor_date(DATA_YEAR, '6 months')) %>%
  summarise(avg_juvenile_counts = mean(SUM_JUVENILE_CLEARED_COUNT, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = half_year, y = avg_juvenile_counts)) +
  geom_line() +
  geom_smooth(span = 0.3, color = 'blue', se=FALSE) +
  labs(title = "Average juvenile cleared Counts Over Time",
       subtitle = "(by half year)") +
  scale_x_date(breaks = "1 year", labels = \(x) year(x)) +
  theme_minimal()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 15691

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 379.61

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.4486e+05

This comparison between the charts for juvenile cleared counts and actual counts unveils intriguing patterns and potential divergences in the dynamics of reported incidents over time. In the earlier chart, the relatively lower number of overall actual counts might suggest a period of decreased human trafficking activity. Correspondingly, the juvenile cleared counts chart aligns with this trend by illustrating the year 2017 as having the highest recorded cases of resolved incidents involving juveniles. However, the narrative takes a fascinating turn in the subsequent years. In the latest chart, the year 2021 stands out with a pronounced spike in overall actual counts, indicating a potential surge in human trafficking cases. Contrarily, the juvenile cleared counts for the same year show a decrease, presenting a paradoxical scenario where the overall counts rise while incidents involving juveniles decline. This nuanced interplay between the two charts beckons a deeper exploration into the contextual factors and underlying dynamics influencing these trends, unraveling a complex narrative within the broader landscape of human trafficking data.

# Assuming 'htd_ts' is your tsibble object
# Convert tsibble to a time series object
htd_cf <- ts(htd_ts$SUM_ACTUAL_COUNT, frequency = 1)

# Plot the ACF
acf(htd_cf, ci = 0.95, na.action = na.exclude, main = "ACF of Human Trafficking Counts")

# Plot the PACF
pacf(htd_cf, ci = 0.95, na.action = na.exclude, main = "PACF of Human Trafficking Counts")

Positive peaks may signify periods of increased counts, hinting at potential seasonality or periodic trends. Conversely, negative peaks could imply alternating patterns or cycles in human trafficking occurrences. The decay in correlation as lag increases is expected but notable peaks may reveal consistent intervals of heightened activity. In summary, the ACF analysis aids in unraveling the temporal dynamics, periodicity, and potential seasonality inherent in the reported human trafficking counts over the observed time period.

The PACF plot provides insights into the direct impact of each lag on the human trafficking counts, isolating the specific influence of a given time interval. Significant spikes at certain lags may indicate distinct patterns that persist without interference from other lags. Positive peaks at specific lags suggest a direct positive impact, emphasizing potential recurring patterns in reported incidents. Conversely, negative peaks may signal a direct negative influence, highlighting periods of decreased counts. As with the ACF, the decay in correlation over lags is expected, but identifying prominent spikes in the PACF helps uncover specific temporal relationships. In essence, the PACF analysis refines our understanding of how individual lags contribute to the observed patterns in human trafficking counts, offering a nuanced perspective on the temporal dynamics of the data.