set up
# Session setup: working directory, print options, and package loading.
# NOTE(review): a hard-coded setwd() ties the script to one machine. It is kept
# because other parts of the analysis may rely on it; consider an RStudio
# project with relative paths instead.
setwd("C:/Users/ftuja/OneDrive/Datasets UNH")
# NOTE(review): `length` is not a documented base R option, so `length = 120`
# probably has no effect; `width = 120` may have been intended -- confirm.
options(scipen = 99999, digits = 3, knitr.table.format = "rst", length = 120)
# The global environment is deliberately not cleared here: rm(list = ls())
# does not detach packages or reset options, and it silently deletes whatever
# objects the person running the script already had. Restart R for a clean run.
# Load (installing if missing) every package in a single call, in the same
# order as before so function masking is unchanged.
pacman::p_load(
  tidyverse, tidytext, textclean, tokenizers, markovchain,
  stm, rvest, tm,
  gutenbergr
)
Web scraping the first website
# Download the fraud-safety ranking page and convert the first element that
# matches the ".rank_state_fraud_score" selector into a data frame.
webpage_c <- read_html("https://www.elitepersonalfinance.com/safest-states-fraud-scam/")
tab1 <- as.data.frame(
  html_table(
    html_nodes(webpage_c, ".rank_state_fraud_score")[[1]],
    fill = TRUE
  )
)
# The header text arrives as the first data row: reuse it as the column names,
# then let data.frame() turn those into syntactic names
# (e.g. "Fraud Score:" becomes "Fraud.Score."). Row 1 itself is left in place.
names(tab1) <- as.character(tab1[1, ])
tab1 <- data.frame(tab1)
#It is important to note that for the fraud score, the lower the score the better
Analyzing the first website
# Summarise the scraped table. Every column is character at this point
# (the header text is still stored as row 1), so only length/class are shown.
summary(tab1)
## Rank. State. Total.Number.of.Fraud.Reports.
## Length:51 Length:51 Length:51
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## Fraud.Score.
## Length:51
## Class :character
## Mode :character
# Preview the first rows of the scraped table; row 1 holds the header text.
head(tab1)
## Rank. State. Total.Number.of.Fraud.Reports. Fraud.Score.
## 1 Rank: State: Total Number of Fraud Reports: Fraud Score:
## 2 1 North Dakota 2,090 2.77
## 3 2 South Dakota 2,689 3.09
## 4 3 Iowa 10,370 3.30
## 5 4 Vermont 2,417 3.88
## 6 5 Nebraska 7,590 3.95
#Interestingly, the number of reported cases does not seem to track the fraud score. For example, Iowa has 10,370 reports but a low fraud score of 3.30. One possible explanation is that the total number of fraud reports counts reports, not necessarily confirmed fraud cases.
Getting and Analyzing the second set of data
# Load the state-level human trafficking data from a local CSV file.
# NOTE(review): absolute, machine-specific path -- presumably only valid on the
# author's computer; confirm before running elsewhere.
ht <- read.csv("C:/Users/ftuja/OneDrive/Datasets UNH/htbystate.csv")
# Summarise each column: State is character, the two case measures are numeric.
summary(ht)
## State reportedTraffickingCasesPerCapita reportedTraffickingCases
## Length:50 Min. :1.09 Min. : 9
## Class :character 1st Qu.:2.13 1st Qu.: 42
## Mode :character Median :2.79 Median : 120
## Mean :2.83 Mean : 203
## 3rd Qu.:3.37 3rd Qu.: 238
## Max. :7.50 Max. :1507
# Preview the first rows of the trafficking data.
head(ht)
## State reportedTraffickingCasesPerCapita reportedTraffickingCases
## 1 Nevada 7.50 239
## 2 Mississippi 4.99 148
## 3 Florida 4.08 896
## 4 Georgia 3.85 417
## 5 Ohio 3.84 450
## 6 Delaware 3.84 38
#We could possibly see the same trend in this dataset as the other one that the number of reported cases might differ from the fraud score in the other dataset.
We may see the same pattern in this dataset as in the first one: the number of reported cases may not line up with the fraud score from the other dataset. To compare the two datasets, I will combine them and compare human trafficking cases per capita with the fraud score from the first table.
Joining the data: to analyze fraud score vs. human trafficking cases per capita
# Join the scraped fraud table to the trafficking data on the state name.
join1 <- full_join(tab1, ht, by = c("State." = "State"))
# Row 1 is the scraped header text ("Rank:", "State:", ...) and column 1 is the
# rank; neither is needed for the comparison. (The interactive View()/view()
# calls were removed so the script also runs unattended.)
join1 <- join1[-1, -1]
# Drop the raw fraud report count by name rather than by position, so the step
# keeps working if the column order ever changes.
join2 <- join1[, setdiff(names(join1), "Total.Number.of.Fraud.Reports."), drop = FALSE]
# The fraud score was scraped as text; convert it to numeric so it can be
# compared and plotted alongside the numeric trafficking measures.
join2$Fraud.Score. <- as.numeric(join2$Fraud.Score.)
join2
## State. Fraud.Score. reportedTraffickingCasesPerCapita
## 2 North Dakota 2.77 2.99
## 3 South Dakota 3.09 2.79
## 4 Iowa 3.30 3.09
## 5 Vermont 3.88 1.44
## 6 Nebraska 3.95 3.18
## 7 Alaska 4.06 2.07
## 8 Utah 4.13 2.72
## 9 Wyoming 4.14 2.07
## 10 Maine 4.23 2.66
## 11 Hawaii 4.28 2.49
## 12 Minnesota 4.34 1.82
## 13 Wisconsin 4.52 1.61
## 14 Montana 4.73 3.50
## 15 Idaho 4.77 1.40
## 16 Oklahoma 4.78 2.73
## 17 Massachusetts 4.81 1.55
## 18 Arkansas 4.81 2.83
## 19 West Virginia 4.93 2.15
## 20 Kansas 5.01 3.15
## 21 Kentucky 5.02 3.04
## 22 New Hampshire 5.09 1.09
## 23 New York 5.10 2.35
## 24 Oregon 5.12 3.08
## 25 Mississippi 5.14 4.99
## 26 Illinois 5.18 2.12
## 27 Indiana 5.21 2.31
## 28 Washington 5.21 3.49
## 29 Connecticut 5.41 1.46
## 30 New Jersey 5.56 2.78
## 31 Pennsylvania 5.60 2.12
## 32 Rhode Island 5.61 1.32
## 33 Colorado 5.68 2.99
## 34 California 5.70 3.80
## 35 Louisiana 5.82 3.44
## 36 North Carolina 5.87 2.49
## 37 New Mexico 5.90 3.04
## 38 Ohio 5.98 3.84
## 39 Virginia 6.13 2.20
## 40 Missouri 6.40 3.78
## 41 Arizona 6.44 3.11
## 42 Tennessee 6.49 2.59
## 43 South Carolina 6.60 2.63
## 44 Alabama 6.87 1.66
## 45 Maryland 6.94 3.08
## 46 Texas 7.29 3.63
## 47 Michigan 7.50 3.64
## 48 Delaware 7.58 3.84
## 49 Nevada 7.70 7.50
## 50 Georgia 9.24 3.85
## 51 Florida 9.93 4.08
## reportedTraffickingCases
## 2 23
## 3 25
## 4 98
## 5 9
## 6 62
## 7 15
## 8 90
## 9 12
## 10 36
## 11 35
## 12 104
## 13 94
## 14 38
## 15 26
## 16 109
## 17 107
## 18 86
## 19 38
## 20 92
## 21 136
## 22 15
## 23 454
## 24 132
## 25 148
## 26 267
## 27 157
## 28 272
## 29 52
## 30 247
## 31 271
## 32 14
## 33 176
## 34 1507
## 35 159
## 36 266
## 37 64
## 38 450
## 39 189
## 40 233
## 41 234
## 42 180
## 43 139
## 44 82
## 45 187
## 46 1080
## 47 364
## 48 38
## 49 239
## 50 417
## 51 896
graphing all of US to see if there is a trend
# Side-by-side bars of fraud score and trafficking cases per capita for every
# state.
join2 %>%
  select(-reportedTraffickingCases) %>%
  # Fraud.Score. is scraped as text. It has to be numeric before reshaping;
  # otherwise both measures end up as character and are drawn as categories
  # instead of on a shared numeric axis. as.numeric() is a no-op if the column
  # is already numeric.
  mutate(Fraud.Score. = as.numeric(Fraud.Score.)) %>%
  pivot_longer(-State., names_to = "key", values_to = "value") %>%
  ggplot(aes(State., value, fill = as.factor(key))) +
  geom_col(position = "dodge") +
  coord_flip()
graphing New England States to take a closer look
# Same comparison restricted to the six New England states. Vermont is
# included here; it was missing from the original five-state filter.
join2 %>%
  filter(State. %in% c(
    "Connecticut", "Maine", "Massachusetts",
    "New Hampshire", "Rhode Island", "Vermont"
  )) %>%
  select(-reportedTraffickingCases) %>%
  # Fraud.Score. is scraped as text. It has to be numeric before reshaping;
  # otherwise both measures end up as character and are drawn as categories
  # instead of on a shared numeric axis. as.numeric() is a no-op if the column
  # is already numeric.
  mutate(Fraud.Score. = as.numeric(Fraud.Score.)) %>%
  pivot_longer(-State., names_to = "key", values_to = "value") %>%
  ggplot(aes(State., value, fill = as.factor(key))) +
  geom_col(position = "dodge") +
  coord_flip()
Conclusion
There does not seem to be a correlation between reported trafficking cases per capita and fraud score. This is surprising, as I would expect that a state with a lower fraud score (meaning it is safer) would also have fewer reported trafficking cases. One possible reason is that the number of reports is not always the same as the number of confirmed cases. It could also be that human trafficking is just one type of case that can involve fraud. There are many different types of fraud, and human trafficking is just one small subset of this topic.