R-final-project

Dataset:

Dataset: Lok Sabha 2019 Candidates General Information (https://www.kaggle.com/datasets/themlphdstudent/lok-sabha-election-candidate-list-2004-to-2019)

Goal: To find which factors, apart from the number of votes, helped a candidate win the election.

Importing libraries

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggrepel)
library(ggthemes)

Loading Dataset

# Read the Lok Sabha 2019 candidate table from the local CSV export.
# NOTE(review): the path is absolute and machine-specific; the chunk only
# runs where this exact file location exists.

data <- read.csv(
  file = 'C:\\Users\\bhush\\Downloads\\Coursework\\I 590 INTRO TO R\\datasets\\data_final\\LokSabha2019_xl.csv'
)

# Shape of the table, reported as (rows, columns)

c(nrow(data), ncol(data))

## [1] 7968   10

# Per-column summary of the candidate table
data |>
  summary() |>
  print()

##   Candidate            Party           Criminal.Cases      Education        
##  Length:7968        Length:7968        Min.   :  0.0000   Length:7968       
##  Class :character   Class :character   1st Qu.:  0.0000   Class :character  
##  Mode  :character   Mode  :character   Median :  0.0000   Mode  :character  
##                                        Mean   :  0.5732                     
##                                        3rd Qu.:  0.0000                     
##                                        Max.   :240.0000                     
##                                                                             
##       Age         Total.Assets       Constituency        Liabilities       
##  Min.   :24.00   Min.   :9.000e+00   Length:7968        Min.   :0.000e+00  
##  1st Qu.:38.00   1st Qu.:5.040e+05   Class :character   1st Qu.:0.000e+00  
##  Median :46.00   Median :2.752e+06   Mode  :character   Median :0.000e+00  
##  Mean   :47.12   Mean   :4.201e+07                      Mean   :5.576e+06  
##  3rd Qu.:56.00   3rd Qu.:1.309e+07                      3rd Qu.:6.190e+05  
##  Max.   :90.00   Max.   :1.108e+10                      Max.   :1.548e+09  
##                  NA's   :60                                                
##      Winner           Gender         
##  Min.   :0.00000   Length:7968       
##  1st Qu.:0.00000   Class :character  
##  Median :0.00000   Mode  :character  
##  Mean   :0.03828                     
##  3rd Qu.:0.00000                     
##  Max.   :1.00000                     
##

# Preview the first six rows of the candidate table
head(data, n = 6L)

##               Candidate                              Party Criminal.Cases
## 1    Kuldeep Rai Sharma                                INC              0
## 2           Ayan Mandal                               AITC              0
## 3        C G Saji Kumar All India Hindustan Congress Party              0
## 4           C U Rasheed                                IND              0
## 5 Gour Chandra Majumder                                IND              0
## 6                 Henry                                IND              0
##               Education Age Total.Assets                Constituency
## 1 Graduate Professional  52    132233012 Andaman And Nicobar Islands
## 2              Graduate  30      7270440 Andaman And Nicobar Islands
## 3             12th Pass  48       120000 Andaman And Nicobar Islands
## 4             12th Pass  34       202808 Andaman And Nicobar Islands
## 5              Graduate  52      6062000 Andaman And Nicobar Islands
## 6             10th Pass  50        56459 Andaman And Nicobar Islands
##   Liabilities Winner Gender
## 1    80450870      1      M
## 2     1500000      0      M
## 3           0      0      M
## 4     1700000      0      M
## 5           0      0      M
## 6           0      0      M

Data Visualization:

Number of Winners:

# Bar chart of how many candidates lost (Winner = 0) versus won (Winner = 1).
# Winner is stored as a numeric 0/1 flag; converting it to a factor gives a
# discrete x axis with exactly two categories instead of a continuous scale
# with fractional breaks.
ggplot(data, aes(x = factor(Winner))) +
  geom_bar(fill = "skyblue", color = "black") +
  # Print the bar height (row count) just above each bar.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.4) +
  labs(
    title = "Count of Winners",
    x = "Winner",
    y = "Count"
  ) +
  theme_minimal()

Age Distribution

# Histogram of candidate ages using 30 bins
ggplot(data = data, mapping = aes(x = Age)) +
  geom_histogram(bins = 30, alpha = 0.7, color = "black", fill = "skyblue") +
  theme_minimal() +
  labs(
    title = "Distribution of Age",
    x = "Age",
    y = "Count"
  )

Distribution of age - by education

# Text label for the election result ("W" = won, "L" = lost); the later
# party-level plots map colour to this column.
data$newW <- ifelse(data$Winner == 1, "W", "L")

# Stacked bar chart: number of candidates at each age, filled by education.
# geom_bar() counts rows per distinct x value and has no `bins` parameter
# (that belongs to geom_histogram()), so the ignored `bins = 40` argument was
# removed to stop the "Ignoring unknown parameters" warning.
ggplot(data, aes(x = Age, fill = Education)) +
  geom_bar(color = "white", alpha = 0.7) +
  labs(
    title = "Distribution of Age by Education",
    x = "Age",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

## Warning in geom_bar(color = "white", bins = 40, alpha = 0.7): Ignoring unknown
## parameters: `bins`

Distribution of Winners - by education

# Number of winning candidates (Winner == 1) at each education level
data |>
  filter(Winner == 1) |>
  count(Education) |>
  ggplot(aes(x = Education, y = n, fill = Education)) +
  geom_col(color = "white") +
  scale_fill_viridis_d() +
  labs(
    title = "Distribution of Winners by Education",
    x = "Education",
    y = "Count"
  ) +
  theme_minimal() +
  # Tilt the education labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Age distribution for political parties:

# Age of INC candidates, coloured by result (newW: "W" = won, "L" = lost).
# Blue dashed line: mean age over all candidates.
# Orange dashed line: mean age of the INC candidates shown in the plot.
mean_age <- mean(data$Age)
data |>
  filter(Party == "INC") |>
  ggplot() +
  # size and alpha are fixed values, so they are set outside aes(); inside
  # aes() they were treated as data mappings and added spurious legends.
  geom_jitter(
    mapping = aes(y = Party, x = Age, color = newW),
    size = 6,
    alpha = 0.7
  ) +
  # The overall mean is a constant, so it is passed as a plain parameter.
  # Line thickness uses `linewidth`; `size` for lines is deprecated.
  geom_vline(
    xintercept = mean_age,
    linetype = "dashed",
    color = "blue",
    linewidth = 1
  ) +
  # mean(Age) is evaluated on the filtered (INC-only) rows.
  geom_vline(
    mapping = aes(xintercept = mean(Age)),
    linetype = "dashed",
    color = "orange",
    linewidth = 1
  ) +
  labs(
    title = "Age Distribution for INC Party Members",
    x = "Age",
    y = "Party",
    caption = "Vertical lines represent mean ages, blue for overall mean"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 12)
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Age of BJP candidates, coloured by result (newW: "W" = won, "L" = lost).
# Blue dashed line: mean age over all candidates.
# Orange dashed line: mean age of the BJP candidates shown in the plot.
mean_age <- mean(data$Age)
data |>
  filter(Party == "BJP") |>
  ggplot() +
  # size and alpha are fixed values, so they are set outside aes(); inside
  # aes() they were treated as data mappings and added spurious legends.
  geom_jitter(
    mapping = aes(y = Party, x = Age, color = newW),
    size = 6,
    alpha = 0.7
  ) +
  # The overall mean is a constant, so it is passed as a plain parameter.
  # Line thickness uses `linewidth`; `size` for lines is deprecated.
  geom_vline(
    xintercept = mean_age,
    linetype = "dashed",
    color = "blue",
    linewidth = 1
  ) +
  # mean(Age) is evaluated on the filtered (BJP-only) rows.
  geom_vline(
    mapping = aes(xintercept = mean(Age)),
    linetype = "dashed",
    color = "orange",
    linewidth = 1
  ) +
  labs(
    title = "Age Distribution for BJP Party Members",
    x = "Age",
    y = "Party",
    caption = "Vertical lines represent mean ages, blue for overall mean"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 12)
  )

Top 7 parties ranked by number of candidates:

# Seven parties with the most rows in the table (every candidate is counted,
# not only winners), drawn as bars ordered by that count.
data |>
  count(Party) |>
  arrange(desc(n)) |>
  slice_head(n = 7) |>
  ggplot(aes(x = fct_reorder(Party, n), y = n, fill = Party)) +
  geom_col() +
  labs(
    title = "Top 7 Party Distribution",
    x = "Party",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12)
  )

Total Assets - by Party

# Total assets of BJP candidates, coloured by result (newW: "W" / "L").
# Blue dashed line: mean total assets over all candidates (NAs ignored).
# Orange dashed line: mean total assets of the BJP candidates shown.
mean_assets <- mean(data$Total.Assets, na.rm = TRUE)
data |>
  filter(Party == "BJP") |>
  ggplot() +
  # size and alpha are fixed values, so they are set outside aes(); inside
  # aes() they were treated as data mappings and added spurious legends.
  geom_jitter(
    mapping = aes(y = Party, x = Total.Assets, color = newW),
    size = 6,
    alpha = 0.7
  ) +
  # The overall mean is a constant, so it is passed as a plain parameter.
  # Line thickness uses `linewidth`; `size` for lines is deprecated.
  geom_vline(
    xintercept = mean_assets,
    linetype = "dashed",
    color = "blue",
    linewidth = 1
  ) +
  # mean(Total.Assets) is evaluated on the filtered (BJP-only) rows.
  geom_vline(
    mapping = aes(xintercept = mean(Total.Assets, na.rm = TRUE)),
    linetype = "dashed",
    color = "orange",
    linewidth = 1
  ) +
  labs(
    title = "Total Assets Distribution for BJP Party Members",
    x = "Total Assets",
    y = "Party",
    caption = "Vertical lines represent mean total assets, blue represents for whole"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 12)
  )

# Total assets of INC candidates, coloured by result (newW: "W" / "L").
# Blue dashed line: mean total assets over all candidates (NAs ignored).
# Orange dashed line: mean total assets of the INC candidates shown.
mean_assets <- mean(data$Total.Assets, na.rm = TRUE)
data |>
  filter(Party == "INC") |>
  ggplot() +
  # size and alpha are fixed values, so they are set outside aes(); inside
  # aes() they were treated as data mappings and added spurious legends.
  geom_jitter(
    mapping = aes(y = Party, x = Total.Assets, color = newW),
    size = 6,
    alpha = 0.7
  ) +
  # The overall mean is a constant, so it is passed as a plain parameter.
  # Line thickness uses `linewidth`; `size` for lines is deprecated.
  geom_vline(
    xintercept = mean_assets,
    linetype = "dashed",
    color = "blue",
    linewidth = 1
  ) +
  # mean(Total.Assets) is evaluated on the filtered (INC-only) rows.
  geom_vline(
    mapping = aes(xintercept = mean(Total.Assets, na.rm = TRUE)),
    linetype = "dashed",
    color = "orange",
    linewidth = 1
  ) +
  labs(
    title = "Total Assets Distribution for INC Party Members",
    x = "Total Assets",
    y = "Party",
    caption = "Vertical lines represent mean total assets, blue represents for whole"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 12)
  )

Gender Analysis

# Side-by-side bars of candidate counts per gender, split by result
# (Winner: 0 = lost, 1 = won), with the count printed above each bar.
data |>
  # count() returns an ungrouped table, so no "grouped output" message is
  # emitted and no grouping leaks into the plot.
  count(Gender, Winner) |>
  ggplot(aes(x = Gender, y = n)) +
  geom_col(aes(fill = as.factor(Winner)), position = "dodge") +
  # The text layer needs its own grouping by Winner; without it
  # position_dodge() has nothing to separate and both labels of a gender are
  # drawn at the centre of the pair instead of above their own bars.
  geom_text(
    aes(label = n, group = Winner),
    position = position_dodge(width = 0.9),
    vjust = -0.15,
    size = 4,
    color = "black"
  ) +
  labs(
    title = "Distribution of Winners by Gender",
    x = "Gender",
    y = "Count"
  ) +
  scale_fill_manual(values = c("0" = "lightgray", "1" = "blue"), name = "Winner") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.position = "top",
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 12)
  )

## `summarise()` has grouped output by 'Gender'. You can override using the
## `.groups` argument.

Total Assets - by gender

# Total assets of female BJP candidates, coloured by result (newW: "W" / "L").
# Blue line: mean total assets over all candidates (NAs ignored).
# Orange line: mean total assets of the female BJP candidates shown.
mean_assets <- mean(data$Total.Assets, na.rm = TRUE)
data |>
  filter(Party == "BJP") |>
  filter(Gender == "F") |>
  ggplot() +
  # size is a fixed value, so it is set outside aes(); inside aes() it was
  # treated as a data mapping and added a spurious legend.
  geom_jitter(mapping = aes(y = Party, x = Total.Assets, color = newW), size = 6) +
  # The overall mean is a constant, so it is passed as a plain parameter.
  geom_vline(xintercept = mean_assets, colour = "blue") +
  # mean(Total.Assets) is evaluated on the filtered rows only.
  geom_vline(
    mapping = aes(xintercept = mean(Total.Assets, na.rm = TRUE)),
    colour = "orange"
  ) +
  labs(
    title = "Total Assets Distribution for BJP female Party Members",
    x = "Total Assets",
    y = "Party",
    caption = "Vertical lines represent mean total assets, blue represents for whole"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    legend.title = element_text(size = 12)
  )

Some Statistical tests!

Let’s assume (as our null hypothesis) that there is no relationship between the number of criminal cases registered against a candidate and the result of the election (Winner).

We do some tests to find it out!

# Flag candidates with at least one registered criminal case ("Y") versus
# none ("N"). The column is referenced by bare name so mutate() reads it from
# the table being transformed rather than from the global `data` object.
data <- data |>
  mutate(crime_hist = ifelse(Criminal.Cases > 0, "Y", "N"))

# Dodged bars: number of candidates per result (Winner 0/1), split by whether
# they have a crime history.
data |>
  # count() returns an ungrouped table, so no "grouped output" message is
  # emitted by this step.
  count(crime_hist, Winner) |>
  ggplot(aes(x = factor(Winner), y = n, fill = crime_hist)) +
  geom_col(position = "dodge", color = "white") +
  labs(
    title = "Distribution of Winners by Crime History",
    x = "Winner",
    y = "Count"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("N" = "blue", "Y" = "red"))  # Adjust colors as needed

## `summarise()` has grouped output by 'crime_hist'. You can override using the
## `.groups` argument.

# Mann-Whitney U (Wilcoxon rank-sum) test: does the number of criminal cases
# differ between losing (Winner = 0) and winning (Winner = 1) candidates?
# The formula interface splits Criminal.Cases into the two Winner groups.
# Passing the two columns as separate x and y samples would instead compare
# the 0/1 result flags against the case counts, which says nothing about the
# relationship between them.
# NOTE: any printed result recorded below from a previous run needs to be
# regenerated after this change.
mwu_test_result <- wilcox.test(Criminal.Cases ~ Winner, data = data)

print(mwu_test_result)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  data$Winner and data$Criminal.Cases
## W = 26841217, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

<!--#

The test is a non-parametric method for assessing whether two independent groups (Winner, Criminal Cases) differ in the distribution of their values; it is used here because the data are not normally distributed.

In summary:

Test Statistic (W): 26841217 This is the test statistic, which is basically the sum of the ranks for one group.
P-value: < 2.2e-16 (very close to zero) This means there’s a very low chance that the observed difference in medians is due to random chance.
Conclusion: We reject the null hypothesis; there is evidence of a significant difference in the distribution of Criminal.Cases between different levels of Winner -->

We assume (as our null hypothesis) that a candidate’s total assets and the candidate’s result in the election are not related.

Let’s do some tests to find out!

# Label each candidate by whether Total.Assets is at or above ("upper half")
# or below ("lower half") the mean of Total.Assets over all candidates.
# The cut point is the mean, computed with missing values ignored.
data <- mutate(
  data,
  pop_half = ifelse(
    Total.Assets >= mean(Total.Assets, na.rm = TRUE),
    "upper half",
    "lower half"
  )
) |>
  ungroup()

options(repr.plot.height = 180)

# Winners only: total assets per party, coloured by the pop_half label.
ggplot(
  data = filter(data, Winner == 1),
  mapping = aes(x = Total.Assets, y = Party, color = pop_half)
) +
  geom_point(size = 2) +  # point size
  scale_colour_brewer(palette = "Dark2") +
  theme_hc(base_size = 15) +  # base font size of the theme
  theme(
    legend.position = "right",
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 8),
    axis.title = element_text(size = 15),
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 10)
  )

# Mann-Whitney U (Wilcoxon rank-sum) test: do total assets differ between
# losing (Winner = 0) and winning (Winner = 1) candidates?
# The formula interface splits Total.Assets into the two Winner groups and
# treats them as independent (unpaired) samples, which is the default.
# Passing the two columns as separate x and y samples would instead compare
# the 0/1 result flags against the asset values, so every asset value
# outranks every flag and the statistic is degenerate.
# NOTE: any printed result recorded below from a previous run needs to be
# regenerated after this change.
mwu_test_result <- wilcox.test(Total.Assets ~ Winner, data = data)

print(mwu_test_result)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  data$Winner and data$Total.Assets
## W = 0, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

We again use a Wilcoxon rank-sum test to assess whether two independent groups (Winner, Total Assets) differ in the distribution of their values, because the data are not normally distributed.