Dataset

Dataset: Loksabha 2019 Candidates General Information. (https://www.kaggle.com/datasets/themlphdstudent/lok-sabha-election-candidate-list-2004-to-2019)

# Importing required libraries

library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggrepel)

Loading our dataset

# Read the candidate CSV into a base data.frame named `data`.
# NOTE(review): the path below is absolute and machine-specific (Windows user
# directory); the script only runs where the file exists at this location.
data <-read.csv('C:\\Users\\bhush\\Downloads\\Coursework\\I 590 INTRO TO R\\datasets\\data_final\\LokSabha2019_xl.csv')

Calculating Equity of each candidate and storing in a new column:

We know that “Assets - Liabilities = Equity” (More information on: Link)

# Work on a copy so the raw import in `data` stays untouched.
data_copy <- data

# Equity (net worth) of each candidate: Total.Assets - Liabilities.
# Columns are referenced by bare name through dplyr's data mask instead of
# `data_copy$...`, so the expression always operates on the rows flowing
# through the pipe (and keeps working if the pipeline is grouped or filtered).
data_copy_ <- data_copy |>
  mutate(equity = Total.Assets - Liabilities)

Age deviation of each candidate from the mean age of their party (computed separately for winners and non-winners)

# Age deviation of every candidate from the mean age of their group, where a
# group is one (Party, Winner) combination.
#   - `na.rm = TRUE` keeps a single missing age from turning the deviation of
#     every candidate in that group into NA.
#   - `as.integer()` truncates the deviation toward zero (e.g. -2.7 -> -2).
# Only the columns needed to inspect the result are kept.
data_copy2 <- data_copy |>
  group_by(Party, Winner) |>
  mutate(Age_deviation = as.integer(Age - mean(Age, na.rm = TRUE))) |>
  ungroup() |>
  select(Candidate, Party, Age, Age_deviation, Winner)

Ranking based on Equity amount (for assessing the candidate’s wealth)

# Rank candidates by equity (Total.Assets - Liabilities) and sort by that rank.
#   - Columns are referenced by bare name via dplyr's data mask rather than
#     `data_copy$...`, so the computation follows the rows in the pipe.
#   - `rank(desc(x))` gives rank 1 to the largest equity; tied values share the
#     average of their ranks, and a missing equity yields a missing rank.
#   - `arrange()` sorts ascending by rank and places missing ranks last.
data_copy3 <- data_copy |>
  mutate(
    Equity_amt = Total.Assets - Liabilities,
    real_wealth_rank = rank(desc(Equity_amt))
  ) |>
  arrange(real_wealth_rank)

Calculating Male to Female Ratio for each constituency.

# One row per constituency with the number of male ("M") and female ("F")
# candidates. The ratio divides by (Female_Count + 1), so a constituency with
# no female candidates yields a finite value instead of a division by zero.
data_copy4 <- data_copy |>
  group_by(Constituency) |>
  summarise(
    Male_Count = sum(Gender == "M"),
    Female_Count = sum(Gender == "F")
  ) |>
  ungroup() |>
  mutate(Male_to_Female_Ratio = Male_Count / (Female_Count + 1))
# Jitter plot of the male-to-female ratio per constituency. Constituencies
# whose ratio exceeds 24 are labelled in blue; the y-axis tick labels are
# hidden.
ggplot(
  data_copy4,
  mapping = aes(x = Male_to_Female_Ratio, y = Constituency)
) +
  geom_jitter() +
  geom_text_repel(
    data = data_copy4 |> filter(Male_to_Female_Ratio > 24),
    mapping = aes(label = Constituency),
    color = "Blue"
  ) +
  theme(axis.text.y = element_blank())

We can observe an outlier, let’s find more about it:

# Keep only the constituencies whose ratio exceeds 75 and show them.
outlier <- filter(data_copy4, Male_to_Female_Ratio > 75)

print(outlier)
## # A tibble: 1 × 4
##   Constituency Male_Count Female_Count Male_to_Female_Ratio
##   <chr>             <int>        <int>                <dbl>
## 1 Nizamabad           182            1                   91

Plotting the equities for the winning candidates compared to the mean equity amount

# Mean equity over all candidates (missing values ignored); each candidate is
# then tagged as above or below that mean in the `color_` column.
mean_equity_amt <- mean(data_copy3$Equity_amt, na.rm = TRUE)
data_copy3 <- data_copy3 |>
  mutate(
    color_ = ifelse(Equity_amt > mean_equity_amt, "Above Mean", "Below Mean")
  )

# Jitter plot of equity by gender for winning candidates only (Winner == 1),
# coloured by the above/below-mean tag.
data_copy3 |>
  filter(Winner == 1) |>
  ggplot(mapping = aes(x = Equity_amt, y = Gender, color = color_)) +
  geom_jitter()

Finding correlation among various numeric factors in our data set.

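A positive correlation coefficient indicates that ‘x’ tends to increase as ‘y’ increases, while a negative coefficient indicates that ‘x’ tends to decrease as ‘y’ increases; values close to 0 indicate little or no linear relationship.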

# Pairwise Pearson correlations; rows with a missing value in either variable
# are dropped for that pair ("complete.obs").
with(data_copy3, cor(x = Age, y = Criminal.Cases, use = "complete.obs"))
## [1] 0.02234815
with(data_copy3, cor(x = Criminal.Cases, y = Winner, use = "complete.obs"))
## [1] -0.02815683
with(data_copy3, cor(x = Age, y = Total.Assets, use = "complete.obs"))
## [1] 0.111774
with(data_copy3, cor(x = Age, y = Equity_amt, use = "complete.obs"))
## [1] 0.1083645