rm(list = ls())
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
data <- read.csv("Final_Dataset.csv")
summary(data)
##   Iso3_code           Country             Region           Subregion        
##  Length:113634      Length:113634      Length:113634      Length:113634     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   Indicator          Dimension           Category             Sex           
##  Length:113634      Length:113634      Length:113634      Length:113634     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      Age                 Year      Unit.of.measurement     VALUE         
##  Length:113634      Min.   :1990   Length:113634       Min.   :     0.0  
##  Class :character   1st Qu.:2012   Class :character    1st Qu.:     0.3  
##  Mode  :character   Median :2016   Mode  :character    Median :     2.6  
##                     Mean   :2015                       Mean   :   726.5  
##                     3rd Qu.:2019                       3rd Qu.:    21.0  
##                     Max.   :2022                       Max.   :457945.5  
##                                                                          
##       GDP            Unemployment.Rate religionByCountry_christians
##  Min.   :    22.85   Min.   : 0.100    Min.   :0.000               
##  1st Qu.:  5826.83   1st Qu.: 4.560    1st Qu.:0.727               
##  Median : 14294.26   Median : 6.760    Median :0.918               
##  Mean   : 23256.25   Mean   : 8.026    Mean   :0.769               
##  3rd Qu.: 33673.75   3rd Qu.:10.030    3rd Qu.:0.979               
##  Max.   :204097.11   Max.   :37.320    Max.   :0.999               
##  NA's   :8400        NA's   :15427     NA's   :20061               
##  religionByCountry_muslims religionByCountry_unaffiliated
##  Min.   :0.000             Min.   :0.000                 
##  1st Qu.:0.002             1st Qu.:0.028                 
##  Median :0.027             Median :0.071                 
##  Mean   :0.108             Mean   :0.158                 
##  3rd Qu.:0.063             3rd Qu.:0.220                 
##  Max.   :0.999             Max.   :1.404                 
##  NA's   :20061             NA's   :20061                 
##  religionByCountry_hindus religionByCountry_buddhists
##  Min.   :0.000            Min.   :0.000              
##  1st Qu.:0.001            1st Qu.:0.001              
##  Median :0.002            Median :0.002              
##  Mean   :0.027            Mean   :0.046              
##  3rd Qu.:0.015            3rd Qu.:0.011              
##  Max.   :0.809            Max.   :0.970              
##  NA's   :20061            NA's   :20061              
##  religionByCountry_folkReligions  Corona.1.Yes      Immigrants      
##  Min.   :0.000                   Min.   :0.0000   Min.   :    7412  
##  1st Qu.:0.001                   1st Qu.:0.0000   1st Qu.:  863518  
##  Median :0.005                   Median :0.0000   Median : 2477230  
##  Mean   :0.013                   Mean   :0.1753   Mean   : 7672933  
##  3rd Qu.:0.015                   3rd Qu.:0.0000   3rd Qu.: 8853505  
##  Max.   :0.459                   Max.   :1.0000   Max.   :52593868  
##  NA's   :20061                                    NA's   :79405
clean_data <- data %>% filter(Sex == "Female")
clean_data <- clean_data %>% select(-Iso3_code)
clean_data <- clean_data %>% filter(Unit.of.measurement != "Counts")
#

religion_data <- clean_data %>% select(-c(Region,Corona.1.Yes, Immigrants,Subregion, Dimension, Category, Sex, Age, Year, GDP, Unit.of.measurement, Indicator, Unemployment.Rate))

religion_data <- na.omit(religion_data)

religion_data <- religion_data %>% group_by(Country) %>% summarise(across(everything(), mean))

manipulating_data  <- religion_data  %>% select(-VALUE)





religion_data2 <- manipulating_data %>% pivot_longer(cols = !Country,  names_to = "Religion", values_to = "Fraction")

religion_data3 <- religion_data2 %>% group_by(Country) %>% summarise(Fraction = max(Fraction))

final_religion <- inner_join(religion_data2, religion_data3, by = c("Fraction", "Country"))

final_religion <- inner_join(final_religion, religion_data %>% select(Country, VALUE), by = "Country")



set.seed(210191)

split <- 0.75
rows  <- nrow(final_religion)

train.entries <- sample(rows, rows*split)

data.train <- final_religion[train.entries, ]
data.valid  <- final_religion[-train.entries,  ]

model <- lm(VALUE ~ Religion * Fraction, data=data.train)    

model <- step(model, trace=0)

data.train <- data.train %>%
  mutate(yhat = predict(model, newdata=data.train)) %>%
  mutate(residual = VALUE - yhat)

print(sum(is.na(data.train$residual)))
## [1] 0
data.valid <- data.valid %>%
  mutate(yhat = predict(model, newdata=data.valid)) %>%
  mutate(residual = VALUE - yhat)

print(sum(is.na(data.valid$residual)))
## [1] 0
sd(data.train$residual)
## [1] 1.607206
sd(data.valid$residual)
## [1] 1.906245
ggplot(data.valid) +
  geom_jitter(aes(x=yhat, y=residual)) +
  geom_hline(aes(yintercept=0), linetype="dashed", color="red") +
  ylim(-5,5) +
  xlab("Ratio of Religious Status") +
  ylab("Female Deaths per 100,000 population") +
  ggtitle("Religions Among Countries vs Femicides")+
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

#ggplot(final_religion) +
#  geom_point(aes(x=Fraction, y=VALUE, x ="Religion", y="Deaths")) 


ggplot(final_religion, aes(x=Fraction, y=VALUE, color=Religion)) +
  geom_point()+
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)+
  xlab("Ratio of Religious Status") +
  ylab("Female Deaths per 100,000 population") +
  ggtitle("Dominant Religious Standings vs Femicides") +
  ylim(0,10) +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 36 rows containing missing values or values outside the scale range
## (`geom_smooth()`).

ggplot(final_religion, aes(x=Fraction, y=VALUE, color=Religion)) +
  geom_point()+
  xlab("Ratio of Religious Status") +
  ylab("Female Deaths per 100,000 population") +
  ggtitle("Dominant Religions Among Countries vs Femicides") +
  ylim(0,10) +
  theme(plot.title = element_text(hjust = 0.5))