MATH1324 Applied Analytics Assignment 2

Hate crime in England and Wales from 2020 to 2021 Using Two-Sample T-Test

Siddharth Vangari s3894864

Last updated: 27 May, 2023

Introduction

Introduction Cont.

Problem Statement

Data

Data Cont.

Using the read.csv() function, the data set was imported into R. Using the head() function, the data set was organized.

Hatecrime <- read.csv("C:/Users/vanga/OneDrive/Desktop/rmit/Sem 3/Applied Analytics/Final Assignment/hate_crime_data.csv")
knitr::kable(head(Hatecrime, 10))
Financial.Year Force.Name Motivating.factor Number.of.offences
2011/12 Avon and Somerset Disability 113
2011/12 Bedfordshire Disability 6
2011/12 British Transport Police Disability 25
2011/12 Cambridgeshire Disability 6
2011/12 Cheshire Disability 7
2011/12 Cleveland Disability 15
2011/12 Cumbria Disability 17
2011/12 Derbyshire Disability 12
2011/12 Devon and Cornwall Disability 7
2011/12 Dorset Disability 9

Preprocessing

colnames(Hatecrime) <- gsub("\\.", "_", colnames(Hatecrime))
head(Hatecrime, 5)
Hatecrime$Number_of_offences <- as.numeric(gsub(",", "", Hatecrime$Number_of_offences))
Hatecrimefiltered <- Hatecrime[Hatecrime$Motivating_factor %in% c("Race", "Religion"), ]
Hatecrimefiltered[] <- lapply(Hatecrimefiltered, as.factor)
Hatecrimefiltered$Number_of_offences <- as.numeric(as.character(Hatecrimefiltered$Number_of_offences))
Hatecrimefiltered <- Hatecrimefiltered[complete.cases(Hatecrimefiltered$Number_of_offences), ]

Preprocessing Cont.

knitr::kable(head(Hatecrimefiltered,10))
Financial_Year Force_Name Motivating_factor Number_of_offences
45 2011/12 Avon and Somerset Race 1241
46 2011/12 Bedfordshire Race 266
47 2011/12 British Transport Police Race 1349
48 2011/12 Cambridgeshire Race 338
49 2011/12 Cheshire Race 293
50 2011/12 Cleveland Race 307
51 2011/12 Cumbria Race 194
52 2011/12 Derbyshire Race 440
53 2011/12 Devon and Cornwall Race 737
54 2011/12 Dorset Race 226

Descriptive Statistics and Visualisation

Hatecrimefiltered %>% group_by(Motivating_factor) %>% summarise(Min = min(Number_of_offences,na.rm = TRUE),
                                                                Q1 = quantile(Number_of_offences,probs = .25,na.rm = TRUE),
                                                                Median = median(Number_of_offences, na.rm = TRUE),
                                                                Q3 = round(quantile(Number_of_offences,probs = .75,na.rm = TRUE),1),
                                                                Max = max(Number_of_offences,na.rm = TRUE),
                                                                Mean = round(mean(Number_of_offences, na.rm = TRUE),1),
                                                                SD = round(sd(Number_of_offences, na.rm = TRUE),1),
                                                                n = n(),
                                                                Missing = sum(is.na(Number_of_offences))) -> crime_by_Motivating_Factor
knitr::kable(crime_by_Motivating_Factor)
Motivating_factor Min Q1 Median Q3 Max Mean SD n Missing
Race 51 418.5 744 1437.0 21938 1433.9 2435.3 483 0
Religion 0 19.0 43 105.5 2506 120.0 290.4 483 0

Decsriptive Statistics Cont.

# Create histogram plots
plot_Race <- ggplot(Hatecrimefiltered[Hatecrimefiltered$Motivating_factor == "Race", ], aes(x = Number_of_offences)) +
  geom_histogram(fill = "blue", color = "white", bins = 10) +
  labs(x = "Number of Offences", y = "Frequency") +
  ggtitle("Histogram of Crimes Due to Race")

plot_Religion <- ggplot(Hatecrimefiltered[Hatecrimefiltered$Motivating_factor == "Religion", ], aes(x = Number_of_offences)) +
  geom_histogram(fill = "red", color = "white", bins = 10) +
  labs(x = "Number of Offences", y = "Frequency") +
  ggtitle("Histogram of Crimes Due to Religion")

# Arrange the histograms side by side
grid.arrange(plot_Race, plot_Religion, nrow = 1)

Visual Representation of Outliers

# Calculate the quartiles and IQR
Q1 <- quantile(Hatecrimefiltered$Number_of_offences, 0.25)
Q3 <- quantile(Hatecrimefiltered$Number_of_offences, 0.75)
IQR <- Q3 - Q1

# Define the lower and upper bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Remove outliers from the data
HatecrimeCleaned <- Hatecrimefiltered[Hatecrimefiltered$Number_of_offences >= lower_bound & Hatecrimefiltered$Number_of_offences <= upper_bound, ]

# Print the filtered data
head(HatecrimeCleaned)
is.outlier = function(x){(x < summary(x)[2] - 1.5*IQR(x))|(x > summary(x)[5] + 1.5*IQR(x))}
sum(is.outlier(HatecrimeCleaned$Number_of_offences))
## [1] 45
boxplot(Number_of_offences~Motivating_factor, data = HatecrimeCleaned, xlab = "Motivating Factor",
        ylab = "Number of Offences", main = "Number Of Offences Motivated By Race To Religion", col=c("blue", "pink"))

Hypothesis Testing

H0: There is no statistically significant difference between the average number of offences motivated by Race and Religion.

\[H_0: \mu_1 = \mu_2 \] HA: There is a statistically significant difference between the average number of offences motivated by Race and religion.

\[H_A: \mu_1 \ne \mu_2\]

Hypthesis Testing - QQ Plot

# Filter the data for offences motivated by race
raceData <- HatecrimeCleaned$Number_of_offences[HatecrimeCleaned$Motivating_factor == "Race"]

qqnorm(raceData)
qqline(raceData)

#title(main = "QQ Plot for Number of Offences Motivated by Race")
# Filter the data for offences motivated by race
religionData <- HatecrimeCleaned$Number_of_offences[HatecrimeCleaned$Motivating_factor == "Religion"]

qqnorm(religionData)
qqline(religionData)

#title(main = "QQ Plot for Number of Offences Motivated by Race")

Homogeneity of Variance - Levene’s Test

# Levene's Test of Equal Variance

leveneTest(Number_of_offences~Motivating_factor, data = HatecrimeCleaned) %>% as.data.frame()

Hypthesis Testing - Two-sample t-test

t.test(
  Number_of_offences~Motivating_factor, 
  data = HatecrimeCleaned,
  var.equal = FALSE,
  alternative = "two.sided"
)
## 
##  Welch Two Sample t-test
## 
## data:  Number_of_offences by Motivating_factor
## t = 25.194, df = 512.64, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Race and group Religion is not equal to 0
## 95 percent confidence interval:
##  574.9778 672.2357
## sample estimates:
##     mean in group Race mean in group Religion 
##              721.18204               97.57531
# Difference between the two means

721.18204 - 97.57531
## [1] 623.6067

Hypthesis Testing - Interpretation

Discussion

References