Dataset: Lok Sabha 2019 Candidates – General Information (https://www.kaggle.com/datasets/themlphdstudent/lok-sabha-election-candidate-list-2004-to-2019)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggrepel)
library(ggthemes)
# Import the candidate-level CSV export into a data frame.
data <- read.csv(
  "C:\\Users\\bhush\\Downloads\\Coursework\\I 590 INTRO TO R\\datasets\\data_final\\LokSabha2019_xl.csv"
)
# Shape of the table, reported as rows then columns.
dim(data)
## [1] 7968 10
# Per-column summary statistics for the whole table.
summary(data) |>
  print()
## Candidate Party Criminal.Cases Education
## Length:7968 Length:7968 Min. : 0.0000 Length:7968
## Class :character Class :character 1st Qu.: 0.0000 Class :character
## Mode :character Mode :character Median : 0.0000 Mode :character
## Mean : 0.5732
## 3rd Qu.: 0.0000
## Max. :240.0000
##
## Age Total.Assets Constituency Liabilities
## Min. :24.00 Min. :9.000e+00 Length:7968 Min. :0.000e+00
## 1st Qu.:38.00 1st Qu.:5.040e+05 Class :character 1st Qu.:0.000e+00
## Median :46.00 Median :2.752e+06 Mode :character Median :0.000e+00
## Mean :47.12 Mean :4.201e+07 Mean :5.576e+06
## 3rd Qu.:56.00 3rd Qu.:1.309e+07 3rd Qu.:6.190e+05
## Max. :90.00 Max. :1.108e+10 Max. :1.548e+09
## NA's :60
## Winner Gender
## Min. :0.00000 Length:7968
## 1st Qu.:0.00000 Class :character
## Median :0.00000 Mode :character
## Mean :0.03828
## 3rd Qu.:0.00000
## Max. :1.00000
##
# Preview the first six rows.
head(data, n = 6L)
## Candidate Party Criminal.Cases
## 1 Kuldeep Rai Sharma INC 0
## 2 Ayan Mandal AITC 0
## 3 C G Saji Kumar All India Hindustan Congress Party 0
## 4 C U Rasheed IND 0
## 5 Gour Chandra Majumder IND 0
## 6 Henry IND 0
## Education Age Total.Assets Constituency
## 1 Graduate Professional 52 132233012 Andaman And Nicobar Islands
## 2 Graduate 30 7270440 Andaman And Nicobar Islands
## 3 12th Pass 48 120000 Andaman And Nicobar Islands
## 4 12th Pass 34 202808 Andaman And Nicobar Islands
## 5 Graduate 52 6062000 Andaman And Nicobar Islands
## 6 10th Pass 50 56459 Andaman And Nicobar Islands
## Liabilities Winner Gender
## 1 80450870 1 M
## 2 1500000 0 M
## 3 0 0 M
## 4 1700000 0 M
## 5 0 0 M
## 6 0 0 M
# Bar chart of the number of candidates per Winner value, with each bar's
# count printed just above it.
data |>
  ggplot(mapping = aes(x = Winner)) +
  geom_bar(colour = "black", fill = "skyblue") +
  geom_text(
    mapping = aes(label = after_stat(count)),
    stat = "count",
    vjust = -0.4
  ) +
  labs(
    x = "Winner",
    y = "Count",
    title = "Count of Winners"
  ) +
  theme_minimal()
# Histogram of candidate ages using 30 bins.
ggplot(data, mapping = aes(x = Age)) +
  geom_histogram(
    bins = 30,
    alpha = 0.7,
    colour = "black",
    fill = "skyblue"
  ) +
  labs(
    x = "Age",
    y = "Count",
    title = "Distribution of Age"
  ) +
  theme_minimal()
# Text version of the Winner flag: "W" where Winner equals 1, otherwise "L".
data[["newW"]] <- ifelse(data[["Winner"]] == 1, "W", "L")
# Stacked bar chart of candidate counts at each age, split by education level.
# geom_bar() counts rows per distinct x value and has no `bins` parameter, so
# the previously supplied `bins = 40` was ignored with a warning; it is removed
# here, which leaves the rendered chart unchanged.
ggplot(data, aes(x = Age, fill = Education)) +
  geom_bar(color = "white", alpha = 0.7) +
  labs(
    title = "Distribution of Age by Education",
    x = "Age",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
# Number of winning candidates at each education level, one bar per level.
data |>
  filter(Winner == 1) |>
  count(Education) |>
  ggplot(mapping = aes(x = Education, y = n, fill = Education)) +
  geom_col(colour = "white") +
  scale_fill_viridis_d() +
  labs(
    x = "Education",
    y = "Count",
    title = "Distribution of Winners by Education"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Mean age across every candidate; drawn as the blue reference line below.
# na.rm = TRUE keeps the reference defined should any age be missing.
mean_age <- mean(data$Age, na.rm = TRUE)

#' Jittered strip plot of candidate ages for a single party.
#'
#' Points are coloured by the win/loss flag in `newW`. Two dashed vertical
#' lines are added: blue at `overall_mean` and orange at the mean age of the
#' selected party.
#'
#' @param df Data frame with `Party`, `Age` and `newW` columns.
#' @param party Value of `Party` to keep, e.g. "INC".
#' @param overall_mean Age at which the blue reference line is drawn.
#' @return A ggplot object.
plot_party_age <- function(df, party, overall_mean) {
  # %in% treats a missing Party as "no match", so no NA rows are introduced.
  party_df <- df[df$Party %in% party, , drop = FALSE]
  party_mean <- mean(party_df$Age, na.rm = TRUE)

  ggplot(party_df) +
    # size and alpha are constants, so they are set outside aes(); inside
    # aes() they are treated as data, which adds spurious legends and does
    # not apply the literal values.
    geom_jitter(
      mapping = aes(x = Age, y = Party, color = newW),
      size = 6,
      alpha = 0.7
    ) +
    # Constant intercepts are passed directly (one line each), and line
    # thickness uses `linewidth`, which replaced `size` in ggplot2 3.4.0.
    geom_vline(
      xintercept = overall_mean,
      linetype = "dashed",
      color = "blue",
      linewidth = 1
    ) +
    geom_vline(
      xintercept = party_mean,
      linetype = "dashed",
      color = "orange",
      linewidth = 1
    ) +
    labs(
      title = paste0("Age Distribution for ", party, " Party Members"),
      x = "Age",
      y = "Party",
      caption = "Vertical lines represent mean ages, blue for overall mean"
    ) +
    theme_minimal() +
    theme(
      legend.position = "bottom",
      plot.title = element_text(hjust = 0.5),
      axis.text = element_text(size = 10),
      axis.title = element_text(size = 12),
      legend.text = element_text(size = 10),
      legend.title = element_text(size = 12)
    )
}

plot_party_age(data, "INC", mean_age)
plot_party_age(data, "BJP", mean_age)
# Candidate counts for the seven parties with the most candidates, shown as
# bars ordered by count.
data |>
  count(Party) |>
  arrange(desc(n)) |>
  slice_head(n = 7) |>
  ggplot(mapping = aes(x = fct_reorder(Party, n), y = n, fill = Party)) +
  geom_col() +
  labs(
    x = "Party",
    y = "Count",
    title = "Top 7 Party Distribution"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12)
  )
# Mean declared assets across every candidate (missing values ignored);
# drawn as the blue reference line below.
mean_assets <- mean(data$Total.Assets, na.rm = TRUE)

#' Jittered strip plot of total assets for a single party.
#'
#' Points are coloured by the win/loss flag in `newW`. Two dashed vertical
#' lines are added: blue at `overall_mean` and orange at the mean total assets
#' of the selected party.
#'
#' @param df Data frame with `Party`, `Total.Assets` and `newW` columns.
#' @param party Value of `Party` to keep, e.g. "BJP".
#' @param overall_mean Asset value at which the blue reference line is drawn.
#' @return A ggplot object.
plot_party_assets <- function(df, party, overall_mean) {
  # %in% treats a missing Party as "no match", so no NA rows are introduced.
  party_df <- df[df$Party %in% party, , drop = FALSE]
  party_mean <- mean(party_df$Total.Assets, na.rm = TRUE)

  ggplot(party_df) +
    # size and alpha are constants, so they are set outside aes(); inside
    # aes() they are treated as data, which adds spurious legends and does
    # not apply the literal values.
    geom_jitter(
      mapping = aes(x = Total.Assets, y = Party, color = newW),
      size = 6,
      alpha = 0.7
    ) +
    # Constant intercepts are passed directly (one line each), and line
    # thickness uses `linewidth`, which replaced `size` in ggplot2 3.4.0.
    geom_vline(
      xintercept = overall_mean,
      linetype = "dashed",
      color = "blue",
      linewidth = 1
    ) +
    geom_vline(
      xintercept = party_mean,
      linetype = "dashed",
      color = "orange",
      linewidth = 1
    ) +
    labs(
      title = paste0(
        "Total Assets Distribution for ", party, " Party Members"
      ),
      x = "Total Assets",
      y = "Party",
      caption = "Vertical lines represent mean total assets, blue represents for whole"
    ) +
    theme_minimal() +
    theme(
      legend.position = "bottom",
      plot.title = element_text(hjust = 0.5),
      axis.text = element_text(size = 10),
      axis.title = element_text(size = 12),
      legend.text = element_text(size = 10),
      legend.title = element_text(size = 12)
    )
}

plot_party_assets(data, "BJP", mean_assets)
plot_party_assets(data, "INC", mean_assets)
# Winners and non-winners per gender as side-by-side bars, each labelled with
# its count. count() returns an ungrouped table, so no grouping message is
# emitted.
data |>
  count(Gender, Winner) |>
  ggplot(mapping = aes(x = Gender, y = n)) +
  geom_col(
    mapping = aes(fill = as.factor(Winner)),
    position = "dodge"
  ) +
  geom_text(
    # `group` gives position_dodge() one group per Winner level within each
    # gender; without it both labels sit at the centre of the gender rather
    # than above their own bars.
    mapping = aes(label = n, group = Winner),
    position = position_dodge(width = 0.9),
    vjust = -0.15,
    size = 4,
    color = "black"
  ) +
  labs(
    title = "Distribution of Winners by Gender",
    x = "Gender",
    y = "Count"
  ) +
  scale_fill_manual(
    values = c("0" = "lightgray", "1" = "blue"),
    name = "Winner"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 12)
  )
# Mean declared assets across every candidate (missing values ignored);
# drawn as the blue reference line.
mean_assets <- mean(data$Total.Assets, na.rm = TRUE)

# Female BJP candidates and their mean declared assets (orange line).
bjp_women <- data |>
  filter(Party == "BJP", Gender == "F")
bjp_women_mean_assets <- mean(bjp_women$Total.Assets, na.rm = TRUE)

ggplot(bjp_women) +
  # size is a constant, so it is set outside aes(); inside aes() it is
  # treated as data, which adds a spurious legend and does not apply the
  # literal value.
  geom_jitter(
    mapping = aes(x = Total.Assets, y = Party, color = newW),
    size = 6
  ) +
  # Constant intercepts are passed directly so each line is drawn once.
  geom_vline(xintercept = mean_assets, colour = "blue") +
  geom_vline(xintercept = bjp_women_mean_assets, colour = "orange") +
  labs(
    title = "Total Assets Distribution for BJP female Party Members",
    x = "Total Assets",
    y = "Party",
    caption = "Vertical lines represent mean total assets, blue represents for whole"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 12)
  )
We run some tests to find out whether a candidate's criminal record is related to winning.
# Flag candidates with at least one criminal case: "Y" if Criminal.Cases is
# above zero, otherwise "N". The column is referenced by bare name so that
# mutate() reads it from the piped data frame rather than from the outer
# `data` object.
data <- data |>
  mutate(crime_hist = ifelse(Criminal.Cases > 0, "Y", "N"))
# Candidate counts by Winner value, with side-by-side bars for candidates
# with ("Y") and without ("N") a criminal case. count() returns an ungrouped
# table, so no grouping message is emitted.
data |>
  count(crime_hist, Winner) |>
  ggplot(aes(x = factor(Winner), y = n, fill = crime_hist)) +
  geom_col(position = "dodge", color = "white") +
  labs(
    title = "Distribution of Winners by Crime History",
    x = "Winner",
    y = "Count"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("N" = "blue", "Y" = "red")) # Adjust colors as needed
# Mann-Whitney U (Wilcoxon rank-sum) test: do winners and non-winners differ
# in their number of criminal cases?
# The formula interface splits Criminal.Cases into two samples by the level of
# Winner. Passing the two columns as separate samples would instead compare
# the 0/1 Winner indicator against the case counts, which is not the question
# being asked.
# NOTE(review): the W statistic and p-value recorded below came from the
# earlier two-sample call and need to be regenerated after this change.
mwu_test_result <- wilcox.test(Criminal.Cases ~ Winner, data = data)
print(mwu_test_result)
##
## Wilcoxon rank sum test with continuity correction
##
## data: data$Winner and data$Criminal.Cases
## W = 26841217, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
<!--#
The Mann-Whitney U test is a non-parametric method for assessing whether two independent groups (Winner, Criminal Cases) differ in the distribution of their values; we use it here because the data are not normally distributed.
In summary:
Test Statistic (W): 26841217 This is the test statistic, which is basically the sum of the ranks for one group.
P-value: < 2.2e-16 (very close to zero). This means that, if there were no real difference between the groups, a result at least this extreme would be very unlikely to arise by chance.
Conclusion: We reject the null hypothesis; there is evidence of a significant difference in the distribution of Criminal.Cases between the levels of Winner. -->