The data set for this dashboard was downloaded from the worldwide web. This dashboard is for demonstration purposes only and the numbers can not be confirmed as factual.
The dashboard sets out to highlight stats and graphs representing fatalities due to USA police officer shootings - no names or addresses are contained within this dashboard.
We also need to take into account when looking at the numbers the percentage split by race in the USA varies, and in some cases, for a true and fair reflection the numbers would need to be weighted according the the percentage of population made up by different race groups - see section 5 in index for clarification
For reference the race split in the US is as follows:
Other notes regarding the dashboard
Finally - if you are interested in seeing the source code that drives this dashboard then click on the “source code” tab on the far right of the dashboard.
| date | manner_of_death | armed | age | gender | race | dv | state | smi | threat_level | flee | body_camera | longitude | latitude | number | year | wday | race_g | race_perc |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2022-09-02 | shot | gun | 47 | M | U | NW | WI | 0 | attack | Car | 0 | -87.912 | 43.046 | 1 | 2022 | Fri | Unkn | 0.0 |
| 2022-09-03 | shot | gun | 37 | M | U | NW | WA | 0 | other | Not fleeing | 1 | -117.372 | 47.702 | 1 | 2022 | Sat | Unkn | 0.0 |
| 2022-09-03 | shot | gun | 41 | M | U | NW | CA | 0 | other | Not fleeing | 0 | -118.309 | 33.938 | 1 | 2022 | Sat | Unkn | 0.0 |
| 2022-09-03 | shot | knife | 40 | M | U | NW | NC | 1 | attack | Not fleeing | 0 | -78.994 | 35.055 | 1 | 2022 | Sat | Unkn | 0.0 |
| 2022-09-03 | shot | undetermined | 37 | M | U | NW | NJ | 0 | undetermined | No data | 0 | -73.982 | 40.896 | 1 | 2022 | Sat | Unkn | 0.0 |
| 2022-09-04 | shot | knife | 54 | M | U | NW | WA | 0 | other | Not fleeing | 0 | -122.304 | 47.335 | 1 | 2022 | Sun | Unkn | 0.0 |
| 2022-09-05 | shot | unknown weapon | 29 | M | U | NW | CO | 0 | other | Not fleeing | 0 | -104.989 | 39.916 | 1 | 2022 | Mon | Unkn | 0.0 |
| 2022-09-05 | shot | unarmed | 32 | M | U | NW | OH | 1 | attack | No data | 1 | -81.818 | 41.431 | 1 | 2022 | Mon | Unkn | 0.0 |
| 2022-09-05 | shot | gun | 63 | M | U | NW | AK | 0 | attack | Not fleeing | 0 | -149.429 | 61.596 | 1 | 2022 | Mon | Unkn | 0.0 |
| 2022-09-06 | shot | gun | 37 | M | U | NW | WI | 0 | other | Not fleeing | 1 | -89.944 | 44.025 | 1 | 2022 | Tue | Unkn | 0.0 |
| 2022-09-07 | shot | screwdriver | 37 | M | U | NW | AZ | 0 | other | Foot | 0 | -111.584 | 35.224 | 1 | 2022 | Wed | Unkn | 0.0 |
| 2022-09-07 | shot | gun | 37 | M | W | W | AL | 0 | attack | Car | 0 | -86.472 | 33.775 | 1 | 2022 | Wed | Wh | 62.0 |
| 2022-09-07 | shot | knife | 61 | M | B | NW | MO | 0 | other | Not fleeing | 0 | -90.290 | 38.668 | 1 | 2022 | Wed | B_A | 12.6 |
| 2022-09-07 | shot | gun | 27 | M | U | NW | TN | 0 | attack | Other | 0 | -82.539 | 36.309 | 1 | 2022 | Wed | Unkn | 0.0 |
| 2022-09-07 | shot | gun | 30 | M | U | NW | GA | 0 | attack | Foot | 0 | NA | NA | 1 | 2022 | Wed | Unkn | 0.0 |
weight<- final_data_set %>%
group_by(number) %>%
summarize(n = n())
weight2<- final_data_set %>%
group_by(race_g) %>%
summarize(tot_by_race = n())
final_weighting <-weight2 %>%
select(race_g, tot_by_race) %>%
mutate(race_perc = as.double(case_when(race_g == 'B_A' ~ 12.6, ## Blacks & Africans 12.6 % of total pop
race_g == 'Oth' ~ 8.5, ## etc
race_g == 'Wh' ~ 62.0,
race_g == 'Hisp' ~ 16.9, TRUE ~ 0.0))) %>%
mutate(tot = weight$n) %>%
mutate(pot = (tot_by_race/tot)*100) %>%
mutate(w_t = race_perc/pot) %>% ## weighted % = race_perc / pot(% of total)
mutate(wtnor = tot_by_race * w_t) %>% ## weighted number of race!
mutate(perc_diff = ((tot_by_race/wtnor)*100)-100)
#final_weighting %>%
#kable()| race_g | tot_by_race | race_perc | tot | pot | w_t | wtnor | perc_diff |
|---|---|---|---|---|---|---|---|
| B_A | 1799 | 12.6 | 7729 | 23.275974 | 0.5413307 | 973.854 | 84.72995 |
| Hisp | 1128 | 16.9 | 7729 | 14.594385 | 1.1579796 | 1306.201 | -13.64269 |
| Oth | 144 | 8.5 | 7729 | 1.863113 | 4.5622569 | 656.965 | -78.08102 |
| Unkn | 1460 | 0.0 | 7729 | 18.889895 | 0.0000000 | 0.000 | Inf |
| Wh | 3198 | 62.0 | 7729 | 41.376633 | 1.4984303 | 4791.980 | -33.26349 |
race_g F M U
B_A 65 (0.8410%) 1733 (22.4220%) 1 (0.0129%)
Hisp 33 (0.4270%) 1095 (14.1674%) 0 (0.0000%)
Oth 8 (0.1035%) 136 (1.7596%) 0 (0.0000%)
Unkn 54 (0.6987%) 1388 (17.9583%) 18 (0.2329%)
Wh 188 (2.4324%) 3009 (38.9313%) 1 (0.0129%)
race_g F M U
B_A 65 (18.6782%) 1733 (23.5430%) 1 (5.0000%)
Hisp 33 (9.4828%) 1095 (14.8757%) 0 (0.0000%)
Oth 8 (2.2989%) 136 (1.8476%) 0 (0.0000%)
Unkn 54 (15.5172%) 1388 (18.8561%) 18 (90.0000%)
Wh 188 (54.0230%) 3009 (40.8776%) 1 (5.0000%)
race_g F M U
B_A 65 (3.6131%) 1733 (96.3313%) 1 (0.0556%)
Hisp 33 (2.9255%) 1095 (97.0745%) 0 (0.0000%)
Oth 8 (5.5556%) 136 (94.4444%) 0 (0.0000%)
Unkn 54 (3.6986%) 1388 (95.0685%) 18 (1.2329%)
Wh 188 (5.8787%) 3009 (94.0901%) 1 (0.0313%)
Confusion Matrix and Statistics
Reference
Prediction B_A Wh
B_A 36 143
Wh 34 285
Accuracy : 0.6446
95% CI : (0.6008, 0.6867)
No Information Rate : 0.8594
P-Value [Acc > NIR] : 1
Kappa : 0.1091
Mcnemar's Test P-Value : 4.748e-16
Sensitivity : 0.51429
Specificity : 0.66589
Pos Pred Value : 0.20112
Neg Pred Value : 0.89342
Prevalence : 0.14056
Detection Rate : 0.07229
Detection Prevalence : 0.35944
Balanced Accuracy : 0.59009
'Positive' Class : B_A
d_tree <- final_data_set %>%
dplyr::select(age, gender,race_g) %>%
filter(race_g %in% c("B_A","Wh"))
set.seed(12)
indexes = createDataPartition(d_tree$race_g, p = .9, list = F)
train = d_tree[indexes, ]
test = d_tree[-indexes, ]
##train data set
tmodel = ctree(formula=race_g~., data = train)
pred = predict(tmodel, test[,-3])
test$race_g<-factor(test$race_g)
cm <- confusionMatrix(test$race_g, pred)
#print(cm)---
title: "USA Police shootings 2015 to 2022"
output:
flexdashboard::flex_dashboard:
orientation: columns
vertical_layout: fill
# theme: sandstone
css: fd.css
storyboard: false
social: menu
source: embed
keep_tex: yes
---
```{r setup, include=FALSE}
library(flexdashboard)
library(tidyverse)
library(plotly)
library(dplyr)
library(lubridate)
library(plotly)
library(ggplot2)
library(modelr)
library(corrplot)
library(corrgram)
library(janitor)
library(DT)
library(knitr)
library(formattable)
library(rmarkdown)
library(sqldf)
library(party)
library(partykit)
library(caret)
library(packrat)
library(rsconnect)
data_set <- read_csv(file = "C:/Reference Material/Machine_Learning/USA_PS_CSV.csv")
str(data_set) ## 7729 obs
#data_set <- na.omit(data_set) ## 6574 obs
## CHECK FOR NA'S AND THEIR IMPACT
map_int(data_set, ~sum(is.na(.)))
# PREFERRED METHOD
as_tibble(map(data_set, ~ sum(is.na(.)))) %>%
print(width = Inf, n= Inf)
## USING THE ABOVE METHOD WE SEE THAT ALL THE NA's do not impact any numeric values. So, we keep NAs
# NOW WE WANT TO SEE ANY ROWS WITH NA - SEE BELOW METHOD
data_set %>%
filter(if_any(everything(), is.na)) %>%
print(width =Inf)
#OR
ds <-data_set %>%
filter_all(any_vars(is.na(.))) %>%
print(width =Inf,n = Inf)
##WRITE NA DATA TO CSV
write.csv(as_tibble(ds), file = "C:/Reference Material/Machine_Learning/USA_POLICE_SHOOTINGS_DATASET_NAs.csv",
row.names = FALSE)
## NOW WE ARE HAPPY WIYH THE DATA LETS SELECT COLUMNS WE WILL NEED
final_data_set <- data_set %>%
select(everything(),- c(id,name,city,is_geocoding_exact))
##RENAME LONG COLUMN NAME
final_data_set <- final_data_set %>%
rename("smi" = "signs_of_mental_illness")
str(final_data_set)
## CONVERT CHR TO FACTORS WHERE REQUIRED
final_data_set <-final_data_set %>%
mutate_at(vars(manner_of_death,armed,gender,race,dv,state,threat_level,flee), factor)
## CONVERT DATE CHR TO DATE FORMAT
final_data_set$date <- as.Date(final_data_set$date,format="%d/%m/%Y")
## CONVERT TRUE FALSE TO 0 AND 1
final_data_set$smi <- as.integer(as.logical(final_data_set$smi))
final_data_set$body_camera <- as.integer(as.logical(final_data_set$body_camera))
##GET WDAY FROM DATE AS EXTRA COLUMN
final_data_set <- final_data_set %>%
mutate(wday = wday(date, label = TRUE))
## ADD COLUMN INTO RACEGROUPING FOR WEIGHTING STATISTICS
final_data_set <- final_data_set %>%
mutate(race_g = as.factor(case_when(race %in% c('A','B') ~ 'B_A',
race %in% c('N','O') ~ 'Oth',
race == 'W' ~ 'Wh',
race == 'H' ~ 'Hisp', TRUE ~ 'Unkn'))
)
## NOW ADD PERCENT OF USA POPULATION GY RACE GROUPING
final_data_set <- final_data_set %>%
mutate(race_perc = as.double(case_when(race_g == 'B_A' ~ 12.6,
race_g == 'Oth' ~ 8.5,
race_g == 'Wh' ~ 62.0,
race_g == 'Hisp' ~ 16.9, TRUE ~ 0.0))
)
str(final_data_set)
final_data_set <- as_tibble(final_data_set)
str(final_data_set)
glimpse(final_data_set)
```
# Dashboard Summary & Index
##Column {data-width=650}
-----------------------------------------------------------------------
### Summary Notes
* **The data set for this dashboard was downloaded from the worldwide web.**
**This dashboard is for demonstration purposes only and the numbers can not be confirmed as factual.**
* **The dashboard sets out to highlight stats and graphs representing fatalities due to USA police officer shootings - no names or addresses are contained within this dashboard.**
* **We also need to take into account when looking at the numbers the percentage split by race in the USA varies, and in some cases, for a true and fair reflection the numbers would need to be weighted according the the percentage of population made up by different race groups - see section 5 in index for clarification**
**For reference the race split in the US is as follows:**
* White: 62.0% Wh
* Hispanic: 16.9% Hisp
* Black_African 12.6% B_A
* Other 8.5% Oth
**Other notes regarding the dashboard**
* Hover mouse over dynamic graphs to see the data numbers and categories
* If you click on a graph legend then that category will be removed from the chart
* Click legend again to bring back that category
* The Decision tree has been created by using a machine learning algorithm. The tree shows where the most likely outcome (FATALITY) percentage wise, by age - gender - race
**Finally - if you are interested in seeing the source code that drives this dashboard then click on the "source code" tab on the far right of the dashboard.**
```{r}
```
##Column {data-width=650}
-----------------------------------------------------------------------
### Index
1. First item on our dashboard (Data Table) is the data set downloaded from the web
* This is a visual aid only and what is shown are 15 rows of data from total rows of 7,729
2. Second item on our dashboard is a map (USA Map) & Pie Chart highlighting the following:-
* Number of fatalities by state
* Number of fatalities by race and gender
* State & Gender fatalities shown in Pie Chart
3. Third item on our dashboard (boxplot and histogram) are stats for the following:-
* Boxplot stats for "signs of mental illness, age and race"
* Histogram for age & gender
* Histogram for age only
4. Fourth item on our dashboard is "category stats for:
* Race, Gender & Armed showing totals and % of totals
* Bar graph showing number split between gender on category "armed."
5. Fith item on our dashboard is "weighting" of numbers re "race:
* Calculations as to how we arrived at weighted numbers
* Bar chart showing bias toward race 'B_A'
* Grid showing calculation results
6. Sixth item on our dashboard is bar chart showing "fatalities by year and race"
* Fatalities by grid [date,weekday,fatalities]
* Plus sidebar showing gender-race % by all, column and row
7. Seventh item on our dashboard is "Decision Tree"
* Decision Tree
* Confusion matrix
* Algorithm for confusion matrix
```{r}
```
# Data Table
##Column {data-width=650}
-----------------------------------------------------------------------
### Data Table (15 rows taken from a total of 7,729)
```{r,layout="l-body-outset"}
tail(final_data_set,15) %>%
kable()
```
# USA Map
##Column {data-width=650}
-----------------------------------------------------------------------
### USA Map 2015 to 2022 - USA Police shootings proven fatal
```{r}
map <- final_data_set %>%
group_by(state) %>%
dplyr::summarise(mean_age = mean(age),
count = n(),
black_african = sum(number[race_g == 'B_A']),
white = sum(number[race_g == 'Wh']),
hispanic = sum(number[race_g == 'Hisp']),
other = sum(number[race_g == 'Oth']),
female = sum(number[gender == 'F']),
male = sum(number[gender == 'M']),
unknown = sum(number[race_g == 'Unkn'])
)
map$hover <- with(map, paste("Fatalities in the state of..",state, '<br>', '<br>',
"Mean age:", round(mean_age,2), '<br>',
'<br>',
"Deaths by Race:-", '<br>',
"Black-African:", black_african,'<br>',
"White:", white ,'<br>',
"Hispanic:", hispanic ,'<br>',
"Other race:", other,'<br>',
"Unknown race:", unknown,'<br>',
'<br>',
"Deaths by Gender:-", '<br>',
"Male:" , male ,'<br>',
"Female", female
))
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
fig <- plot_geo(map, locationmode = 'USA-states')
fig <- fig %>% add_trace(
z = ~count, text = ~hover, locations = ~state,
color = ~count, colors = 'Reds'
)
fig <- fig %>% colorbar(title = "Number of fatalities")
fig <- fig %>% layout(
title = '',
geo = g
)
fig%>%
config(displayModeBar = FALSE)
```
## Column {data-width=350}
-----------------------------------------------------------------------
### Pie Chart (States & Gender - fatalities)
```{r}
us_pf <-data_set %>%
na.omit()
sf <- sqldf(" select (state ||'..'|| gender) as state_gender, sum(number) as amt
from us_pf
group by (state ||'..'|| gender)
"
)
colors <- c('rgb(211,94,96)', 'rgb(128,133,133)', 'rgb(144,103,167)', 'rgb(171,104,87)', 'rgb(114,147,203)')
figz <- plot_ly(sf, labels = ~state_gender, values = ~amt, type = 'pie',
textposition = 'inside',
textinfo = 'label+percent',
insidetextfont = list(color = '#FFFFFF'),
hoverinfo = 'text',
text = ~paste('State..Gender', state_gender,'<br>','Data:', amt, ' Fatalities'),
marker = list(colors = colors,
line = list(color = '#FFFFFF', width = 1)),
#The 'pull' attribute can also be used to create space between the sectors
showlegend = FALSE)
figz <- figz %>% layout(#title = 'United States fatalities by State and Gender',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
figz%>%
config(displayModeBar = FALSE)
```
# Mental illness Stats
##Column {data-width=750}
-----------------------------------------------------------------------
### Boxplot showing stats for "mental illness, age & race by YEAR
```{r}
bp <-final_data_set %>%
filter(gender != 'U') %>%
ggplot()+
geom_boxplot(mapping = aes(x = reorder(race_g,age),y = age,color = race_g)) +
labs(y="Age & by Signs of mental illness (1 =True, 0 = False)", x="Race")+
theme(axis.text.x=element_text(size=5))+
facet_grid(smi~year, scales="free", space="free_x")
ggplotly(bp)%>%
config(displayModeBar = FALSE)
```
## Column {data-width=300}
-----------------------------------------------------------------------
### Histogram of race and age
```{r}
his_age <- final_data_set %>%
select(race_g,age) %>%
filter(race_g != 'Unkn')
his_age%>%
group_by(race_g) %>%
do(p=plot_ly(., x = ~age,name =~race_g, #nbinsx = 5,
type = "histogram")) %>%
subplot(nrows = 2, shareX = TRUE, shareY = TRUE)%>%
config(displayModeBar = FALSE)
```
### Histogram of age
```{r, echo=FALSE}
hist1 <- ggplot(data = his_age,mapping = aes(x = age), color = age)+
geom_histogram(binwidth = 5)+
xlab("Bin width = 5") +
ylab("Count of age within bin")+
theme_gray()
ggplotly(hist1)%>%
config(displayModeBar = FALSE)
```
# Armed & Category stats
##Column {data-width=600}
-----------------------------------------------------------------------
### Race-Gender-Armed stats (Pie Chart)
```{r}
figa <- plot_ly()
figa <- figa %>% add_pie(data = count(final_data_set, gender), labels = ~gender, values = ~n,
name = "Gender", domain = list(x = c(0, 0.4), y = c(0.4, 1)))
figa <- figa %>% add_pie(data = count(final_data_set), labels = ~final_data_set$race_g, values = ~n,
name = "Race", domain = list(x = c(0.6, 1), y = c(0.4, 1)))
figa <- figa %>% add_pie(data = count(final_data_set, armed), labels = ~armed, values = ~n,
name = "Armed", domain = list(x = c(0.25, 0.75), y = c(0, 0.6)))
figa <- figa %>% layout(title = "", showlegend = F,
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
figa%>%
config(displayModeBar = FALSE)
```
## Column {data-width=400}
-----------------------------------------------------------------------
### Armed numbers split by gender (numbers >= 20 )
```{r}
df<-final_data_set %>%
select(gender,armed) %>%
group_by(gender,armed) %>%
summarise(n = n()) %>%
mutate(n = case_when(gender == 'M' ~ n,
TRUE ~ -n))
##NOW TO LIMIT DATA IN ARMED BY NUMBERS > 20
df2<-final_data_set %>%
group_by(armed) %>%
summarise(n = n()) %>%
filter(n>=20) %>%
select(armed)
df <-df %>%
inner_join(df2, by = "armed")
##########################################
brks <- c(seq(-500, 5000, by = 500))
##lbls = c(seq(15, 0, -5), seq(5, 15, 5))
p <- df %>%
ggplot(aes(x = reorder(armed,abs(n)), y = n, fill = gender)) +
geom_bar(stat = "identity", width = .6) +
scale_y_continuous(breaks = brks) +
coord_flip() +
theme_gray() +
labs(title="") +
theme(plot.title = element_text(hjust = .5),
axis.ticks = element_blank())+
xlab("") +
ylab("Number of Casualties")
ggplotly(p)%>%
config(displayModeBar = FALSE)
```
# Race weighted numbers
##Column {data-width=600}
-----------------------------------------------------------------------
### Weighted calculations for grid and bar chart
```{r, echo=TRUE}
weight<- final_data_set %>%
group_by(number) %>%
summarize(n = n())
weight2<- final_data_set %>%
group_by(race_g) %>%
summarize(tot_by_race = n())
final_weighting <-weight2 %>%
select(race_g, tot_by_race) %>%
mutate(race_perc = as.double(case_when(race_g == 'B_A' ~ 12.6, ## Blacks & Africans 12.6 % of total pop
race_g == 'Oth' ~ 8.5, ## etc
race_g == 'Wh' ~ 62.0,
race_g == 'Hisp' ~ 16.9, TRUE ~ 0.0))) %>%
mutate(tot = weight$n) %>%
mutate(pot = (tot_by_race/tot)*100) %>%
mutate(w_t = race_perc/pot) %>% ## weighted % = race_perc / pot(% of total)
mutate(wtnor = tot_by_race * w_t) %>% ## weighted number of race!
mutate(perc_diff = ((tot_by_race/wtnor)*100)-100)
#final_weighting %>%
#kable()
```
## Column {data-width=400}
-----------------------------------------------------------------------
### Chart showing bias that 'B_A' race is targeted more
```{r}
plot_ly(final_weighting, type = "bar", x=final_weighting$perc_diff,
y=final_weighting$race_g, group=final_weighting$race_g, orientation="h",
marker = list(color = c('rgba(222,45,38,0.8)', 'rgba(204,204,204,1)',
'rgba(204,204,204,1)', 'rgba(204,204,204,1)',
'rgba(204,204,204,1)')))%>%
config(displayModeBar = FALSE)
```
### Grid showing calculation results
```{r}
weight<- final_data_set %>%
group_by(number) %>%
summarize(n = n())
weight2<- final_data_set %>%
group_by(race_g) %>%
summarize(tot_by_race = n())
final_weighting <-weight2 %>%
select(race_g, tot_by_race) %>%
mutate(race_perc = as.double(case_when(race_g == 'B_A' ~ 12.6, ## Blacks & Africans 12.6 % of total pop
race_g == 'Oth' ~ 8.5, ## etc
race_g == 'Wh' ~ 62.0,
race_g == 'Hisp' ~ 16.9, TRUE ~ 0.0))) %>%
mutate(tot = weight$n) %>%
mutate(pot = (tot_by_race/tot)*100) %>%
mutate(w_t = race_perc/pot) %>% ## weighted % = race_perc / pot(% of total)
mutate(wtnor = tot_by_race * w_t) %>% ## weighted number of race!
mutate(perc_diff = ((tot_by_race/wtnor)*100)-100)
final_weighting %>%
kable()
```
# Fatalities by year
## Row {.tabset .tabset-fade}
-----------------------------------------------------------------------
### Bar chart showing fatalities by year and race
```{r}
final_data_set$race_g <- factor(final_data_set$race_g , levels=c('B_A','Hisp','Oth','Unkn','Wh'))
p12 <-ggplot(data = final_data_set) +
geom_bar(mapping = aes(x = year, fill = race_g), color = "black") +
theme_classic()+
#labs(title = "Fatalities by Year and by Race", x = "Year", y = "Number of Fatalities") +
coord_flip()
ggplotly(p12)%>%
config(displayModeBar = FALSE)
```
### Fatalities by grid [date,weekday,fatalities]
```{r}
date_wday <- final_data_set %>%
group_by(date) %>%
summarize(n = n())
date_wday <- date_wday %>%
mutate(wday = wday(date, label = TRUE)) %>%
mutate(month = month(date, label = TRUE)) %>%
mutate(year = as.factor(as.integer(format(ymd(date), "%Y"))))
g1 <- date_wday %>%
#filter( year == 2019) %>%
ggplot(aes(date,n, color = wday))+
geom_point( size = 0.6)+
geom_line( size = 0.3)+
scale_x_date(
NULL,
date_breaks = "1 year"
)+ facet_grid(wday~., scales="free", space="free_x")+
theme(plot.background = element_rect(fill = "grey90"))+
theme(axis.line = element_line(linewidth = 3, colour = "red"))+
theme(
panel.background = element_rect(fill = "white")) +
labs(title = "Deaths over time by weekday", x = "", y = "number of deaths")
ggplotly(g1)%>%
config(displayModeBar = FALSE)
```
## Column {data-width=400}
-----------------------------------------------------------------------
### Gender percentages by race - all %
```{r}
g1 <- final_data_set %>%
select(race_g,gender,year)
##
## GENDER RACE %
gr <-g1 %>%
tabyl(race_g,gender) %>%
adorn_percentages("all") %>% ## CAN BE col, row, all
adorn_pct_formatting(digits = 4) %>%
adorn_ns("front")
print(gr)
```
### Gender percentages by race - column %
```{r}
gr <-g1 %>%
tabyl(race_g,gender) %>%
adorn_percentages("col") %>% ## CAN BE col, row, all
adorn_pct_formatting(digits = 4) %>%
adorn_ns("front")
print(gr)
```
### Gender percentages by race - row %
```{r, echo=FALSE}
gr <-g1 %>%
tabyl(race_g,gender) %>%
adorn_percentages("row") %>% ## CAN BE col, row, all
adorn_pct_formatting(digits = 4) %>%
adorn_ns("front")
print(gr)
```
# Decision Tree
##Column {data-width=700}
-----------------------------------------------------------------------
### Decision tree by age, gender and race (White & Non White)
```{r, echo=FALSE}
d_tree <- final_data_set %>%
select(age, gender,dv)
pd <- sample(2,nrow(d_tree),replace = TRUE, prob = c(0.8,0.2))
train <- d_tree[pd == 1,]
validate <- d_tree[pd == 2,]
tree <- ctree(dv ~., data = d_tree
, maxdepth = 5, alpha = 0.5
# , controls = ctree_control(mincriterion = 0.9, minsplit = 50)
)
plot(tree, gp = gpar(fontsize = 7),inner_panel=node_inner(tree,pval = FALSE, id = FALSE, fill = "orange"), terminal_panel = node_barplot(tree,id = FALSE),
ip_args = list(id = FALSE, fill = "red"),
ep_args = list(fill = "yellow"))
```
## Row {.tabset .tabset-fade}
-----------------------------------------------------------------------
### Confusion Matrix output when testing the "test" data set
```{r, echo=FALSE}
d_tree <- final_data_set %>%
dplyr::select(age, gender,race_g) %>%
filter(race_g %in% c("B_A","Wh"))
set.seed(12)
indexes = createDataPartition(d_tree$race_g, p = .9, list = F)
train = d_tree[indexes, ]
test = d_tree[-indexes, ]
##train data set
tmodel = ctree(formula=race_g~., data = train)
pred = predict(tmodel, test[,-3])
test$race_g<-factor(test$race_g)
cm <- confusionMatrix(test$race_g, pred)
print(cm)
```
### Algorithm for confusion matrix
```{r, echo=TRUE}
d_tree <- final_data_set %>%
dplyr::select(age, gender,race_g) %>%
filter(race_g %in% c("B_A","Wh"))
set.seed(12)
indexes = createDataPartition(d_tree$race_g, p = .9, list = F)
train = d_tree[indexes, ]
test = d_tree[-indexes, ]
##train data set
tmodel = ctree(formula=race_g~., data = train)
pred = predict(tmodel, test[,-3])
test$race_g<-factor(test$race_g)
cm <- confusionMatrix(test$race_g, pred)
#print(cm)
```