set up

# Session setup: working directory, print options, and package loading.

# NOTE(review): machine-specific absolute path -- the script only runs as-is
# on this machine. Kept because relative paths elsewhere may depend on it.
setwd("C:/Users/ftuja/OneDrive/Datasets UNH")

# scipen: strongly prefer fixed over scientific notation when printing.
# digits: print about 3 significant digits.
# knitr.table.format: default output format for knitr::kable().
# NOTE(review): `length` is not a documented R option, so `length = 120` does
# not appear to affect printing -- possibly `width = 120` was meant; confirm.
options(scipen = 99999, digits = 3, knitr.table.format = "rst", length = 120)

# The environment is deliberately NOT cleared here. A `remove(list = ls())`
# call deletes the caller's objects when the script is sourced, yet does not
# reset loaded packages or options; restart R for a clean session instead.

# p_load() attaches each package, installing it first if it is missing.
# The order is kept unchanged so that function masking stays the same.
pacman::p_load(
  tidyverse, tidytext, textclean, tokenizers, markovchain,
  stm, rvest, tm,
  gutenbergr
)

Web scraping the first website

# Download the page, take the first element matching the
# ".rank_state_fraud_score" CSS selector, and parse it as an HTML table.
webpage_c <- read_html("https://www.elitepersonalfinance.com/safest-states-fraud-scam/")
tab1 <- as.data.frame(
  html_table(
    html_nodes(webpage_c, ".rank_state_fraud_score")[[1]],
    fill = TRUE
  )
)

# Use the values of the first row as column names; data.frame() then turns
# them into syntactically valid names (e.g. "Rank:" becomes "Rank.").
# The first row itself is not removed here.
tab1 <- data.frame(setNames(tab1, as.character(tab1[1, ])))

# Note: for the fraud score, a lower value is better.

Analyzing the first website

# Column-wise overview of the scraped table: every column is still character.
tab1 %>% summary()
##     Rank.              State.          Total.Number.of.Fraud.Reports.
##  Length:51          Length:51          Length:51                     
##  Class :character   Class :character   Class :character              
##  Mode  :character   Mode  :character   Mode  :character              
##  Fraud.Score.      
##  Length:51         
##  Class :character  
##  Mode  :character
# First rows of the table; row 1 still holds the header text.
tab1 %>% head()
##   Rank.       State. Total.Number.of.Fraud.Reports. Fraud.Score.
## 1 Rank:       State: Total Number of Fraud Reports: Fraud Score:
## 2     1 North Dakota                          2,090         2.77
## 3     2 South Dakota                          2,689         3.09
## 4     3         Iowa                         10,370         3.30
## 5     4      Vermont                          2,417         3.88
## 6     5     Nebraska                          7,590         3.95
# Interestingly, the number of reported cases does not seem to track the
# fraud score. Iowa, for example, has 10,370 reports but a low fraud score of
# 3.30. One possible reason is that the total counts reports, which are not
# necessarily all confirmed fraud cases.

Getting and Analyzing the second set of data

# Load the state-level human trafficking data from the local CSV file.
ht <- read.csv(file = "C:/Users/ftuja/OneDrive/Datasets UNH/htbystate.csv")

# Overview of each column.
ht %>% summary()
##     State           reportedTraffickingCasesPerCapita reportedTraffickingCases
##  Length:50          Min.   :1.09                      Min.   :   9            
##  Class :character   1st Qu.:2.13                      1st Qu.:  42            
##  Mode  :character   Median :2.79                      Median : 120            
##                     Mean   :2.83                      Mean   : 203            
##                     3rd Qu.:3.37                      3rd Qu.: 238            
##                     Max.   :7.50                      Max.   :1507
# First rows of the data.
ht %>% head()
##         State reportedTraffickingCasesPerCapita reportedTraffickingCases
## 1      Nevada                              7.50                      239
## 2 Mississippi                              4.99                      148
## 3     Florida                              4.08                      896
## 4     Georgia                              3.85                      417
## 5        Ohio                              3.84                      450
## 6    Delaware                              3.84                       38
# The same pattern may hold here as in the fraud table: the raw number of
# reported cases need not line up with a per-capita or score-based measure
# (for example, Nevada has the highest per-capita rate, 7.50, with only 239
# reported cases).

As with the fraud data, the raw number of reported cases in this dataset may not line up with a normalized measure such as the fraud score. To compare the two datasets, I will combine them and compare the human trafficking cases per capita against the fraud score from the first table.

Joining the data to analyze fraud score vs. human trafficking cases per capita

# Join the fraud table and the trafficking table on state name, keeping all
# rows from both sides.
join1 <- full_join(tab1, ht, by = c("State." = "State"))
# The data viewer only makes sense in an interactive session; skip it when the
# script is run or knitted non-interactively.
if (interactive()) {
  View(join1)
}

# Drop the leftover header row (State. == "State:") and the Rank. column.
# Selecting by name rather than by position keeps this correct even if the row
# or column order changes. Base indexing preserves the original row names.
join1 <- join1[
  !(join1$State. %in% "State:"),
  names(join1) != "Rank.",
  drop = FALSE
]
if (interactive()) {
  View(join1)
}

# join2 additionally drops the raw fraud-report count, leaving the state, the
# fraud score, and the two trafficking columns.
join2 <- join1[names(join1) != "Total.Number.of.Fraud.Reports."]
join2
##            State. Fraud.Score. reportedTraffickingCasesPerCapita
## 2    North Dakota         2.77                              2.99
## 3    South Dakota         3.09                              2.79
## 4            Iowa         3.30                              3.09
## 5         Vermont         3.88                              1.44
## 6        Nebraska         3.95                              3.18
## 7          Alaska         4.06                              2.07
## 8            Utah         4.13                              2.72
## 9         Wyoming         4.14                              2.07
## 10          Maine         4.23                              2.66
## 11         Hawaii         4.28                              2.49
## 12      Minnesota         4.34                              1.82
## 13      Wisconsin         4.52                              1.61
## 14        Montana         4.73                              3.50
## 15          Idaho         4.77                              1.40
## 16       Oklahoma         4.78                              2.73
## 17  Massachusetts         4.81                              1.55
## 18       Arkansas         4.81                              2.83
## 19  West Virginia         4.93                              2.15
## 20         Kansas         5.01                              3.15
## 21       Kentucky         5.02                              3.04
## 22  New Hampshire         5.09                              1.09
## 23       New York         5.10                              2.35
## 24         Oregon         5.12                              3.08
## 25    Mississippi         5.14                              4.99
## 26       Illinois         5.18                              2.12
## 27        Indiana         5.21                              2.31
## 28     Washington         5.21                              3.49
## 29    Connecticut         5.41                              1.46
## 30     New Jersey         5.56                              2.78
## 31   Pennsylvania         5.60                              2.12
## 32   Rhode Island         5.61                              1.32
## 33       Colorado         5.68                              2.99
## 34     California         5.70                              3.80
## 35      Louisiana         5.82                              3.44
## 36 North Carolina         5.87                              2.49
## 37     New Mexico         5.90                              3.04
## 38           Ohio         5.98                              3.84
## 39       Virginia         6.13                              2.20
## 40       Missouri         6.40                              3.78
## 41        Arizona         6.44                              3.11
## 42      Tennessee         6.49                              2.59
## 43 South Carolina         6.60                              2.63
## 44        Alabama         6.87                              1.66
## 45       Maryland         6.94                              3.08
## 46          Texas         7.29                              3.63
## 47       Michigan         7.50                              3.64
## 48       Delaware         7.58                              3.84
## 49         Nevada         7.70                              7.50
## 50        Georgia         9.24                              3.85
## 51        Florida         9.93                              4.08
##    reportedTraffickingCases
## 2                        23
## 3                        25
## 4                        98
## 5                         9
## 6                        62
## 7                        15
## 8                        90
## 9                        12
## 10                       36
## 11                       35
## 12                      104
## 13                       94
## 14                       38
## 15                       26
## 16                      109
## 17                      107
## 18                       86
## 19                       38
## 20                       92
## 21                      136
## 22                       15
## 23                      454
## 24                      132
## 25                      148
## 26                      267
## 27                      157
## 28                      272
## 29                       52
## 30                      247
## 31                      271
## 32                       14
## 33                      176
## 34                     1507
## 35                      159
## 36                      266
## 37                       64
## 38                      450
## 39                      189
## 40                      233
## 41                      234
## 42                      180
## 43                      139
## 44                       82
## 45                      187
## 46                     1080
## 47                      364
## 48                       38
## 49                      239
## 50                      417
## 51                      896

graphing all of US to see if there is a trend

# Side-by-side bars of fraud score and trafficking cases per capita for every
# state.
join2 %>%
  select(-reportedTraffickingCases) %>%
  # Fraud.Score. was scraped as text. Convert it to numeric before reshaping,
  # otherwise the stacked `value` column becomes character and the bars are
  # drawn on a discrete axis instead of a numeric one.
  mutate(Fraud.Score. = as.numeric(Fraud.Score.)) %>%
  pivot_longer(-State., names_to = "key", values_to = "value") %>%
  ggplot(aes(State., value, fill = key)) +
  geom_col(position = "dodge") +
  coord_flip()

graphing New England States to take a closer look

# Same comparison restricted to the six New England states (Vermont included).
join2 %>%
  filter(
    State. %in% c(
      "Connecticut", "Maine", "Massachusetts",
      "New Hampshire", "Rhode Island", "Vermont"
    )
  ) %>%
  select(-reportedTraffickingCases) %>%
  # Fraud.Score. was scraped as text. Convert it to numeric before reshaping,
  # otherwise the stacked `value` column becomes character and the bars are
  # drawn on a discrete axis instead of a numeric one.
  mutate(Fraud.Score. = as.numeric(Fraud.Score.)) %>%
  pivot_longer(-State., names_to = "key", values_to = "value") %>%
  ggplot(aes(State., value, fill = key)) +
  geom_col(position = "dodge") +
  coord_flip()

Conclusion

There does not seem to be a correlation between reported trafficking cases per capita and fraud score. This is surprising, as I would expect that a state with a lower fraud score (meaning it is safer) would also have a low rate of reported trafficking cases. One possible explanation is that the number of reports is not necessarily the number of confirmed cases. It could also be that human trafficking is just one type of case that can involve fraud: there are many different types of fraud, and human trafficking is only a small subset of this topic.