rm(list = ls())
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
data <- read.csv("Final_Dataset.csv")
summary(data)
## Iso3_code Country Region Subregion
## Length:113634 Length:113634 Length:113634 Length:113634
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Indicator Dimension Category Sex
## Length:113634 Length:113634 Length:113634 Length:113634
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Age Year Unit.of.measurement VALUE
## Length:113634 Min. :1990 Length:113634 Min. : 0.0
## Class :character 1st Qu.:2012 Class :character 1st Qu.: 0.3
## Mode :character Median :2016 Mode :character Median : 2.6
## Mean :2015 Mean : 726.5
## 3rd Qu.:2019 3rd Qu.: 21.0
## Max. :2022 Max. :457945.5
##
## GDP Unemployment.Rate religionByCountry_christians
## Min. : 22.85 Min. : 0.100 Min. :0.000
## 1st Qu.: 5826.83 1st Qu.: 4.560 1st Qu.:0.727
## Median : 14294.26 Median : 6.760 Median :0.918
## Mean : 23256.25 Mean : 8.026 Mean :0.769
## 3rd Qu.: 33673.75 3rd Qu.:10.030 3rd Qu.:0.979
## Max. :204097.11 Max. :37.320 Max. :0.999
## NA's :8400 NA's :15427 NA's :20061
## religionByCountry_muslims religionByCountry_unaffiliated
## Min. :0.000 Min. :0.000
## 1st Qu.:0.002 1st Qu.:0.028
## Median :0.027 Median :0.071
## Mean :0.108 Mean :0.158
## 3rd Qu.:0.063 3rd Qu.:0.220
## Max. :0.999 Max. :1.404
## NA's :20061 NA's :20061
## religionByCountry_hindus religionByCountry_buddhists
## Min. :0.000 Min. :0.000
## 1st Qu.:0.001 1st Qu.:0.001
## Median :0.002 Median :0.002
## Mean :0.027 Mean :0.046
## 3rd Qu.:0.015 3rd Qu.:0.011
## Max. :0.809 Max. :0.970
## NA's :20061 NA's :20061
## religionByCountry_folkReligions Corona.1.Yes Immigrants
## Min. :0.000 Min. :0.0000 Min. : 7412
## 1st Qu.:0.001 1st Qu.:0.0000 1st Qu.: 863518
## Median :0.005 Median :0.0000 Median : 2477230
## Mean :0.013 Mean :0.1753 Mean : 7672933
## 3rd Qu.:0.015 3rd Qu.:0.0000 3rd Qu.: 8853505
## Max. :0.459 Max. :1.0000 Max. :52593868
## NA's :20061 NA's :79405
clean_data <- data %>% filter(Sex == "Female")
clean_data <- clean_data %>% select(-Iso3_code)
clean_data <- clean_data %>% filter(Unit.of.measurement != "Counts")
#
religion_data <- clean_data %>% select(-c(Region,Corona.1.Yes, Immigrants,Subregion, Dimension, Category, Sex, Age, Year, GDP, Unit.of.measurement, Indicator, Unemployment.Rate))
religion_data <- na.omit(religion_data)
religion_data <- religion_data %>% group_by(Country) %>% summarise(across(everything(), mean))
manipulating_data <- religion_data %>% select(-VALUE)
religion_data2 <- manipulating_data %>% pivot_longer(cols = !Country, names_to = "Religion", values_to = "Fraction")
religion_data3 <- religion_data2 %>% group_by(Country) %>% summarise(Fraction = max(Fraction))
final_religion <- inner_join(religion_data2, religion_data3, by = c("Fraction", "Country"))
final_religion <- inner_join(final_religion, religion_data %>% select(Country, VALUE), by = "Country")
set.seed(210191)
split <- 0.75
rows <- nrow(final_religion)
train.entries <- sample(rows, rows*split)
data.train <- final_religion[train.entries, ]
data.valid <- final_religion[-train.entries, ]
model <- lm(VALUE ~ Religion * Fraction, data=data.train)
model <- step(model, trace=0)
data.train <- data.train %>%
mutate(yhat = predict(model, newdata=data.train)) %>%
mutate(residual = VALUE - yhat)
print(sum(is.na(data.train$residual)))
## [1] 0
data.valid <- data.valid %>%
mutate(yhat = predict(model, newdata=data.valid)) %>%
mutate(residual = VALUE - yhat)
print(sum(is.na(data.valid$residual)))
## [1] 0
sd(data.train$residual)
## [1] 1.607206
sd(data.valid$residual)
## [1] 1.906245
ggplot(data.valid) +
geom_jitter(aes(x=yhat, y=residual)) +
geom_hline(aes(yintercept=0), linetype="dashed", color="red") +
ylim(-5,5) +
xlab("Ratio of Religious Status") +
ylab("Female Deaths per 100,000 population") +
ggtitle("Religions Among Countries vs Femicides")+
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

#ggplot(final_religion) +
# geom_point(aes(x=Fraction, y=VALUE, x ="Religion", y="Deaths"))
ggplot(final_religion, aes(x=Fraction, y=VALUE, color=Religion)) +
geom_point()+
geom_smooth(method=lm, se=FALSE, fullrange=TRUE)+
xlab("Ratio of Religious Status") +
ylab("Female Deaths per 100,000 population") +
ggtitle("Dominant Religious Standings vs Femicides") +
ylim(0,10) +
theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 36 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

ggplot(final_religion, aes(x=Fraction, y=VALUE, color=Religion)) +
geom_point()+
xlab("Ratio of Religious Status") +
ylab("Female Deaths per 100,000 population") +
ggtitle("Dominant Religions Among Countries vs Femicides") +
ylim(0,10) +
theme(plot.title = element_text(hjust = 0.5))
