library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.2.3
library(ggrepel)
# time series toolkits
library(xts)
## Warning: package 'xts' was built under R version 4.2.3
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.2.3
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:zoo':
##
## index
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following object is masked from 'package:tsibble':
##
## interval
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
htd <- read.csv("C:\\Users\\moore\\OneDrive\\Desktop\\Fall 2023\\Intro to statistics\\project\\Statistics Project\\Statistics Project\\htd_transformed.csv")
anova <- aov(CLEARED_COUNT ~ ACTUAL_COUNT, data = htd)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## ACTUAL_COUNT 1 256436013 256436013 7635 <2e-16 ***
## Residuals 3096 103980167 33585
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova2 <-aov(JUVENILE_CLEARED_COUNT ~ ACTUAL_COUNT, data = htd)
summary(anova2)
## Df Sum Sq Mean Sq F value Pr(>F)
## ACTUAL_COUNT 1 688205 688205 187.6 <2e-16 ***
## Residuals 3096 11360394 3669
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
htd |>
ggplot(aes(x = CLEARED_COUNT, y = ACTUAL_COUNT, color = REGION_NAME)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Scatter Plot and Regression Lines",
subtitle = "",
x = "CLEARED_COUNT", y = "ACTUAL_COUNT") +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
htd |>
ggplot(aes(x = JUVENILE_CLEARED_COUNT, y = ACTUAL_COUNT, color = REGION_NAME)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Scatter Plot and Regression Lines",
subtitle = "",
x = "Juvenile clear counts", y = "Actual counts") +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
htd_1 <- htd |>
select(DATA_YEAR, REGION_NAME, ACTUAL_COUNT, CLEARED_COUNT, JUVENILE_CLEARED_COUNT) |>
distinct()
htd_1 <- htd_1|>
group_by(DATA_YEAR, REGION_NAME) |>
summarise(SUM_ACTUAL_COUNT = sum(ACTUAL_COUNT),
SUM_JUVENILE_CLEARED_COUNT = sum(JUVENILE_CLEARED_COUNT),
SUM_CLEARED_COUNT = sum(CLEARED_COUNT))
## `summarise()` has grouped output by 'DATA_YEAR'. You can override using the
## `.groups` argument.
htd_ts1 <- htd_1 |>
as_tibble(index = DATA_YEAR, key = c("SUM_ACTUAL_COUNT", "SUM_JUVENILE_CLEARED_COUNT", "SUM_CLEARED_COUNT"))
fit_linear_models <- function(region_name) {
region_data <- htd_ts1 %>% filter(REGION_NAME == region_name)
# Linear model for actual counts
actual_counts_model <- lm(SUM_ACTUAL_COUNT ~ DATA_YEAR, data = region_data)
print(paste("Linear model for actual counts in", region_name))
print(summary(actual_counts_model))
# Linear model for cleared counts
cleared_counts_model <- lm(SUM_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
print(paste("Linear model for cleared counts in", region_name))
print(summary(cleared_counts_model))
# Linear model for juvenile cleared counts
juvenile_cleared_counts_model <- lm(SUM_JUVENILE_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
print(paste("Linear model for juvenile cleared counts in", region_name))
print(summary(juvenile_cleared_counts_model))
}
# Apply the function to each region
fit_linear_models("Midwest")
## [1] "Linear model for actual counts in Midwest"
##
## Call:
## lm(formula = SUM_ACTUAL_COUNT ~ DATA_YEAR, data = region_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4917.9 -2586.1 -869.9 2616.9 6867.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3378490.2 1108880.4 -3.047 0.0187 *
## DATA_YEAR 1681.0 549.8 3.058 0.0184 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4258 on 7 degrees of freedom
## Multiple R-squared: 0.5719, Adjusted R-squared: 0.5107
## F-statistic: 9.35 on 1 and 7 DF, p-value: 0.01838
##
## [1] "Linear model for cleared counts in Midwest"
##
## Call:
## lm(formula = SUM_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4409 -3226 -835 3016 5803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1570460.9 1056457.9 -1.487 0.181
## DATA_YEAR 782.0 523.8 1.493 0.179
##
## Residual standard error: 4057 on 7 degrees of freedom
## Multiple R-squared: 0.2415, Adjusted R-squared: 0.1332
## F-statistic: 2.229 on 1 and 7 DF, p-value: 0.1791
##
## [1] "Linear model for juvenile cleared counts in Midwest"
##
## Call:
## lm(formula = SUM_JUVENILE_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.044 -32.411 -8.578 45.889 72.656
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9532.256 14211.198 -0.671 0.524
## DATA_YEAR 4.767 7.046 0.677 0.520
##
## Residual standard error: 54.58 on 7 degrees of freedom
## Multiple R-squared: 0.06137, Adjusted R-squared: -0.07272
## F-statistic: 0.4577 on 1 and 7 DF, p-value: 0.5204
fit_linear_models("West")
## [1] "Linear model for actual counts in West"
##
## Call:
## lm(formula = SUM_ACTUAL_COUNT ~ DATA_YEAR, data = region_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4736.2 -2430.3 -274.2 3264.7 6140.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7288967.4 1075766.1 -6.776 0.000259 ***
## DATA_YEAR 3622.0 533.3 6.791 0.000255 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4131 on 7 degrees of freedom
## Multiple R-squared: 0.8682, Adjusted R-squared: 0.8494
## F-statistic: 46.12 on 1 and 7 DF, p-value: 0.0002553
##
## [1] "Linear model for cleared counts in West"
##
## Call:
## lm(formula = SUM_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2035.3 -300.4 -160.3 672.6 1788.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3938378 322643 -12.21 5.67e-06 ***
## DATA_YEAR 1956 160 12.23 5.60e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1239 on 7 degrees of freedom
## Multiple R-squared: 0.9553, Adjusted R-squared: 0.9489
## F-statistic: 149.5 on 1 and 7 DF, p-value: 5.603e-06
##
## [1] "Linear model for juvenile cleared counts in West"
##
## Call:
## lm(formula = SUM_JUVENILE_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -395.36 -314.29 -271.62 -96.69 1870.24
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17572.956 198753.277 0.088 0.932
## DATA_YEAR -8.533 98.539 -0.087 0.933
##
## Residual standard error: 763.3 on 7 degrees of freedom
## Multiple R-squared: 0.00107, Adjusted R-squared: -0.1416
## F-statistic: 0.007499 on 1 and 7 DF, p-value: 0.9334
fit_linear_models("South")
## [1] "Linear model for actual counts in South"
##
## Call:
## lm(formula = SUM_ACTUAL_COUNT ~ DATA_YEAR, data = region_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10266.8 -6988.0 791.3 2713.6 10520.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.216e+07 1.993e+06 -6.104 0.000489 ***
## DATA_YEAR 6.045e+03 9.879e+02 6.119 0.000482 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7652 on 7 degrees of freedom
## Multiple R-squared: 0.8425, Adjusted R-squared: 0.82
## F-statistic: 37.45 on 1 and 7 DF, p-value: 0.0004818
##
## [1] "Linear model for cleared counts in South"
##
## Call:
## lm(formula = SUM_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6936 -3558 -913 2618 9862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5439938.3 1385955.8 -3.925 0.00571 **
## DATA_YEAR 2704.3 687.1 3.936 0.00564 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5323 on 7 degrees of freedom
## Multiple R-squared: 0.6887, Adjusted R-squared: 0.6443
## F-statistic: 15.49 on 1 and 7 DF, p-value: 0.005635
##
## [1] "Linear model for juvenile cleared counts in South"
##
## Call:
## lm(formula = SUM_JUVENILE_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1019.8 -820.4 -604.7 -356.0 4811.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -337738.4 509801.8 -0.662 0.529
## DATA_YEAR 168.2 252.8 0.665 0.527
##
## Residual standard error: 1958 on 7 degrees of freedom
## Multiple R-squared: 0.05947, Adjusted R-squared: -0.07489
## F-statistic: 0.4426 on 1 and 7 DF, p-value: 0.5272
fit_linear_models("Northeast")
## [1] "Linear model for actual counts in Northeast"
##
## Call:
## lm(formula = SUM_ACTUAL_COUNT ~ DATA_YEAR, data = region_data)
##
## Residuals:
## 1 2 3 4 5 6 7
## -633.14 -173.86 -76.57 1647.71 468.00 -904.71 -327.43
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -814738.1 353527.3 -2.305 0.0694 .
## DATA_YEAR 404.7 175.2 2.310 0.0689 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 927 on 5 degrees of freedom
## Multiple R-squared: 0.5163, Adjusted R-squared: 0.4196
## F-statistic: 5.337 on 1 and 5 DF, p-value: 0.06889
##
## [1] "Linear model for cleared counts in Northeast"
##
## Call:
## lm(formula = SUM_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
##
## Residuals:
## 1 2 3 4 5 6 7
## -313.7 -287.8 469.1 253.0 523.9 -362.2 -282.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -193278.21 166290.92 -1.162 0.298
## DATA_YEAR 96.11 82.40 1.166 0.296
##
## Residual standard error: 436 on 5 degrees of freedom
## Multiple R-squared: 0.2139, Adjusted R-squared: 0.05664
## F-statistic: 1.36 on 1 and 5 DF, p-value: 0.2961
##
## [1] "Linear model for juvenile cleared counts in Northeast"
##
## Call:
## lm(formula = SUM_JUVENILE_CLEARED_COUNT ~ DATA_YEAR, data = region_data)
##
## Residuals:
## 1 2 3 4 5 6 7
## -4.571 -9.143 -13.714 45.714 -22.857 36.571 -32.000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9206.857 12376.834 -0.744 0.49
## DATA_YEAR 4.571 6.133 0.745 0.49
##
## Residual standard error: 32.45 on 5 degrees of freedom
## Multiple R-squared: 0.1, Adjusted R-squared: -0.08
## F-statistic: 0.5556 on 1 and 5 DF, p-value: 0.4896
#convert year to date
htd$DATA_YEAR <- as.Date(paste(htd$DATA_YEAR, "-01-01", sep = ""), format = "%Y-%m-%d")
htd_ <- htd |>
select(DATA_YEAR, ACTUAL_COUNT, JUVENILE_CLEARED_COUNT) |>
distinct()
htd_ <- htd_|>
group_by(DATA_YEAR) |>
summarise(SUM_ACTUAL_COUNT = sum(ACTUAL_COUNT),
SUM_JUVENILE_CLEARED_COUNT = sum(JUVENILE_CLEARED_COUNT))
htd_ts <- htd_ |>
as_tsibble(index = DATA_YEAR, key = "SUM_ACTUAL_COUNT")
htd_xts <- xts(x = htd_ts$SUM_ACTUAL_COUNT, order.by = htd_ts$DATA_YEAR)
# Set the column name
htd_xts <- setNames(htd_xts, "actual_counts")
#finding rolling average
htd_ts |>
filter_index("2013-01" ~ "2021-01") |>
ggplot(mapping = aes(x = DATA_YEAR, y = SUM_ACTUAL_COUNT)) +
geom_line() +
geom_smooth(method = 'lm', color = 'blue', se=FALSE) +
labs(title = "Actual Counts from January 2013 to January 2021") +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
The presented chart, derived from the tsibble object spanning the years 2013 to 2021, visually encapsulates a discernible upward trend in human trafficking counts. Over this period, the plotted data points exhibit a consistent increase, indicating a rise in the incidence of human trafficking. This trend analysis serves as a valuable insight into the longitudinal dynamics of the issue, potentially prompting further exploration into the contributing factors or necessitating heightened awareness and intervention efforts during the specified timeframe.
# Assuming htd_ is your data frame
htd_ts |>
filter(DATA_YEAR >= as.Date("2013-01-01") & DATA_YEAR <= as.Date("2021-01-01")) |>
ggplot(mapping = aes(x = DATA_YEAR, y = SUM_ACTUAL_COUNT)) +
geom_line() +
geom_smooth(method = 'lm', color = 'blue', se=FALSE) +
labs(title = "Actual Counts Over Time",
subtitle = "Linear Model for 2021-2022") +
theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
# Assuming htd_ts is your time-series data frame
htd_ts %>%
filter(!is.na(SUM_ACTUAL_COUNT)) %>%
index_by(half_year = floor_date(DATA_YEAR, '6 months')) %>%
summarise(avg_actual_counts = mean(SUM_ACTUAL_COUNT, na.rm = TRUE)) %>%
ggplot(mapping = aes(x = half_year, y = avg_actual_counts)) +
geom_line() +
geom_smooth(span = 0.3, color = 'blue', se=FALSE) +
labs(title = "Average Actual Counts Over Time",
subtitle = "(by half year)") +
scale_x_date(breaks = "1 year", labels = \(x) year(x)) +
theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 15691
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 379.61
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.4486e+05
This chart visually captures the substantial surge in human trafficking
counts over the 8-year period, highlighting a noteworthy upward trend in
reported incidents. The distinct seasons revealed in the data indicate
recurring peaks in human trafficking counts, with each year
demonstrating a discernible surge. This cyclical pattern suggests a
consistent and periodic escalation in reported cases, potentially
influenced by factors such as law enforcement efforts, awareness
campaigns, or other external variables. The visualization not only
underscores the overall growth in human trafficking counts but also
emphasizes the periodicity of these incidents, providing valuable
insights for further investigation and analysis.
htd_ts %>%
filter(!is.na(SUM_JUVENILE_CLEARED_COUNT)) %>%
index_by(half_year = floor_date(DATA_YEAR, '6 months')) %>%
summarise(avg_juvenile_counts = mean(SUM_JUVENILE_CLEARED_COUNT, na.rm = TRUE)) %>%
ggplot(mapping = aes(x = half_year, y = avg_juvenile_counts)) +
geom_line() +
geom_smooth(span = 0.3, color = 'blue', se=FALSE) +
labs(title = "Average juvenile cleared Counts Over Time",
subtitle = "(by half year)") +
scale_x_date(breaks = "1 year", labels = \(x) year(x)) +
theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 15691
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 379.61
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.4486e+05
This comparison between the charts for juvenile cleared counts and actual counts unveils intriguing patterns and potential divergences in the dynamics of reported incidents over time. In the earlier chart, the relatively lower number of overall actual counts might suggest a period of decreased human trafficking activity. Correspondingly, the juvenile cleared counts chart aligns with this trend by illustrating the year 2017 as having the highest recorded cases of resolved incidents involving juveniles. However, the narrative takes a fascinating turn in the subsequent years. In the latest chart, the year 2021 stands out with a pronounced spike in overall actual counts, indicating a potential surge in human trafficking cases. Contrarily, the juvenile cleared counts for the same year show a decrease, presenting a paradoxical scenario where the overall counts rise while incidents involving juveniles decline. This nuanced interplay between the two charts beckons a deeper exploration into the contextual factors and underlying dynamics influencing these trends, unraveling a complex narrative within the broader landscape of human trafficking data.
# Assuming 'htd_ts' is your tsibble object
# Convert tsibble to a time series object
htd_cf <- ts(htd_ts$SUM_ACTUAL_COUNT, frequency = 1)
# Plot the ACF
acf(htd_cf, ci = 0.95, na.action = na.exclude, main = "ACF of Human Trafficking Counts")
# Plot the PACF
pacf(htd_cf, ci = 0.95, na.action = na.exclude, main = "PACF of Human Trafficking Counts")
Positive peaks may signify periods of increased counts, hinting at
potential seasonality or periodic trends. Conversely, negative peaks
could imply alternating patterns or cycles in human trafficking
occurrences. The decay in correlation as lag increases is expected but
notable peaks may reveal consistent intervals of heightened activity. In
summary, the ACF analysis aids in unraveling the temporal dynamics,
periodicity, and potential seasonality inherent in the reported human
trafficking counts over the observed time period.
The PACF plot provides insights into the direct impact of each lag on the human trafficking counts, isolating the specific influence of a given time interval. Significant spikes at certain lags may indicate distinct patterns that persist without interference from other lags. Positive peaks at specific lags suggest a direct positive impact, emphasizing potential recurring patterns in reported incidents. Conversely, negative peaks may signal a direct negative influence, highlighting periods of decreased counts. As with the ACF, the decay in correlation over lags is expected, but identifying prominent spikes in the PACF helps uncover specific temporal relationships. In essence, the PACF analysis refines our understanding of how individual lags contribute to the observed patterns in human trafficking counts, offering a nuanced perspective on the temporal dynamics of the data.