Dataset: Loksabha 2019 Candidates General Information. (https://www.kaggle.com/datasets/themlphdstudent/lok-sabha-election-candidate-list-2004-to-2019)
# Importing required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggrepel)
# Read the Lok Sabha 2019 candidate table from the local CSV export.
# NOTE(review): the path is absolute and machine-specific; anyone else running
# this report has to point it at their own copy of the file.
data <- read.csv(
  file = "C:\\Users\\bhush\\Downloads\\Coursework\\I 590 INTRO TO R\\datasets\\data_final\\LokSabha2019_xl.csv"
)
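Alpha level: 0.05 - This conventional threshold limits the probability of a false positive (rejecting a null hypothesis that is actually true) to 5%, while still allowing real effects to be detected when deciding whether to reject the null hypothesis.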
Minimum effect size: 5 years - We chose this because a gap of about five years in average age is large enough to be a meaningful difference in this context.
Conducting a two-sample t-test to determine whether the average ages of BJP and AAP candidates differ
# Average age of the BJP candidates, rounded to zero decimal places.
# The result is a one-row data frame with the single column `bjp_avg_age`.
BJP_candidates_age <- summarise(
  filter(data, Party == "BJP"),
  bjp_avg_age = round(mean(Age), digits = 0)
)
print(BJP_candidates_age)
## bjp_avg_age
## 1 55
# Average age of the AAP candidates, rounded to zero decimal places.
# The result is a one-row data frame with the single column `aap_avg_age`.
AAP_candidates_age <- summarise(
  filter(data, Party == "AAP"),
  aap_avg_age = round(mean(Age), digits = 0)
)
print(AAP_candidates_age)
## aap_avg_age
## 1 48
# Difference between the two rounded party means (BJP minus AAP).
# Subtracting the one-row data frames leaves the result carrying the BJP
# column name ("bjp_avg_age"), so that column is renamed to say what it holds.
age_diff <- BJP_candidates_age - AAP_candidates_age
colnames(age_diff)[colnames(age_diff) == "bjp_avg_age"] <- "age_difference"
# The script exposes the result under both names; assign after the rename so
# `age_difference` no longer keeps the stale "bjp_avg_age" column name.
age_difference <- age_diff
print(age_diff)
## age_difference
## 1 7
# Candidate-level rows for the two parties being compared.
bjp1 <- data |>
  filter(Party == "BJP")
aap1 <- data |>
  filter(Party == "AAP")

# Stack both parties into one frame so every layer can map columns by name
# rather than pulling vectors out of other data frames with `$` inside aes().
party_ages <- bind_rows(bjp1, aap1)

# One row per party holding the rounded mean age computed earlier, so each
# mean marker is drawn exactly once instead of once per candidate row.
party_means <- data.frame(
  Party = c("BJP", "AAP"),
  Age = c(BJP_candidates_age$bjp_avg_age, AAP_candidates_age$aap_avg_age)
)

# Strip plot of individual ages (jittered vertically only) with a green bar
# marking each party's mean age.
ggplot(party_ages, aes(x = Age, y = Party)) +
  geom_jitter(shape = 1, size = 3, width = 0, height = 0.1) +
  geom_point(data = party_means, shape = "|", size = 12, color = "green") +
  labs(title = "Ages of candidates BJP vs AAP", x = "Age", y = "Parties") +
  theme_classic()
# Ages of every BJP and AAP candidate.
# All rows are kept: the t-test below does not need equal group sizes, so
# there is no reason to randomly subsample BJP down to the AAP group size
# (which also made the result change from run to run).
bjp_data <- data |>
  filter(Party == "BJP") |>
  select(Age)
aap_data <- data |>
  filter(Party == "AAP") |>
  select(Age)

# Two-sample (Welch) t-test of the difference in mean age between the parties.
# This replaces chisq.test() on table(bjp ages, aap ages): that cross-tabulated
# arbitrarily paired rows and tested independence of the pairing, which says
# nothing about whether the average ages differ.
# Defaults: two-sided alternative, unequal variances, 95% confidence interval
# (matching the alpha level of 0.05).
result <- t.test(bjp_data$Age, aap_data$Age)
print(result)
## (recorded output not available for the t-test; re-run this chunk to regenerate it)
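Alpha level: 0.05 - This conventional threshold limits the probability of a false positive (rejecting a null hypothesis that is actually true) to 5%, while still allowing real effects to be detected when deciding whether to reject the null hypothesis.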
We will perform a chi-squared test of independence between the number of criminal cases and the Winner column to determine the p-value
# Contingency table: one row per distinct Criminal.Cases value, one column per
# distinct Winner value.
data_table <- table(data[["Criminal.Cases"]], data[["Winner"]])
# Pearson chi-squared test on that table.
result <- chisq.test(data_table)
## Warning in chisq.test(data_table): Chi-squared approximation may be incorrect
print(result)
##
## Pearson's Chi-squared test
##
## data: data_table
## X-squared = 73.971, df = 30, p-value = 1.399e-05
# Sum of the Winner column for each distinct Criminal.Cases value
# (a count of winners, assuming Winner is coded 0/1 -- TODO confirm).
data_prop <- summarise(
  group_by(data, Criminal.Cases),
  winners = sum(Winner)
)

# Scatterplot of that per-group total against the number of criminal cases.
ggplot(data_prop, aes(x = Criminal.Cases, y = winners)) +
  geom_point() +
  labs(
    title = "Scatterplot of Criminal Cases vs. Winners",
    x = "Criminal Cases",
    y = "Winners"
  )
# Fisher's exact test on the Criminal.Cases x Winner contingency table, with a
# simulated (Monte Carlo) p-value because the table is larger than 2 x 2 and
# has sparse cells.
# The test must be given cell counts. Passing the two columns of `data_prop`
# treated the criminal-case values themselves as if they were counts, so the
# reported p-value did not describe the association being studied.
fisher.test(
  table(data$Criminal.Cases, data$Winner),
  simulate.p.value = TRUE
)
## (recorded output not available for the corrected call; re-run this chunk to regenerate it)