# Dataset: Lok Sabha 2019 Candidates General Information (https://www.kaggle.com/datasets/themlphdstudent/lok-sabha-election-candidate-list-2004-to-2019)
# Importing required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggrepel)
# Read the candidate CSV into `data` and show its dimensions (rows, columns).
# NOTE(review): the path is absolute and specific to one machine; adjust it
# before running elsewhere.
csv_path <- 'C:\\Users\\bhush\\Downloads\\Coursework\\I 590 INTRO TO R\\datasets\\data_final\\LokSabha2019_xl.csv'
data <- read.csv(csv_path)
dim(data)
## [1] 7968 10
# Draw `sub_samp` random subsamples of `data`, each holding half of its rows
# (sampled without replacement), and keep them in a list whose elements are
# named df_1, df_2, ... in draw order.
# No seed is set in this block, so the rows drawn differ from run to run.
sub_samp <- 6
sub_samp_size <- nrow(data) / 2
sample_names <- paste0("df_", seq_len(sub_samp))
sub_samp_list <- lapply(
  setNames(nm = sample_names),
  function(sample_name) {
    picked_rows <- sample.int(nrow(data), sub_samp_size)
    data[picked_rows, ]
  }
)
# Number of subsamples created
length(sub_samp_list)
## [1] 6
# Open the second subsample (still wrapped in its one-element list) in the
# data viewer.
view(sub_samp_list[2])

# Fields kept from every subsample, written without the per-sample prefix.
kept_fields <- c(
  "Candidate", "Party", "Criminal.Cases", "Education", "Age",
  "Constituency", "Total.Assets", "Winner", "Gender"
)

# Turn the k-th subsample into a data frame restricted to `kept_fields`.
# data.frame() receives a one-element *named* list, so every column comes out
# prefixed with the element name (e.g. "df_1.Age"); the column selection is
# built with that same prefix.
extract_sample <- function(k) {
  wrapped <- sub_samp_list[k]
  data.frame(wrapped)[, paste0(names(wrapped), ".", kept_fields)]
}

df_1 <- extract_sample(1)
df_2 <- extract_sample(2)
df_3 <- extract_sample(3)
df_4 <- extract_sample(4)
df_5 <- extract_sample(5)
df_6 <- extract_sample(6)
# Age histograms, one per subsample (5-year bins). The histogram layer and the
# labels are the same for every plot, so they are defined once as a list and
# added to each plot in turn.
age_hist_layers <- list(
  geom_histogram(binwidth = 5, fill = "blue"),
  labs(title = "Age Distribution", x = "Age", y = "Frequency")
)

ggplot(df_1, aes(x = df_1.Age)) + age_hist_layers
ggplot(df_2, aes(x = df_2.Age)) + age_hist_layers
ggplot(df_3, aes(x = df_3.Age)) + age_hist_layers
ggplot(df_4, aes(x = df_4.Age)) + age_hist_layers
ggplot(df_5, aes(x = df_5.Age)) + age_hist_layers
ggplot(df_6, aes(x = df_6.Age)) + age_hist_layers
# Count, for each subsample, the rows whose Winner column equals 1, and
# collect the six counts in an unnamed list ordered df_1 ... df_6.
winner_flags <- list(
  df_1$df_1.Winner,
  df_2$df_2.Winner,
  df_3$df_3.Winner,
  df_4$df_4.Winner,
  df_5$df_5.Winner,
  df_6$df_6.Winner
)
Winner_count <- lapply(winner_flags, function(flag) sum(flag == 1))
print(Winner_count)
## [[1]]
## [1] 138
##
## [[2]]
## [1] 149
##
## [[3]]
## [1] 165
##
## [[4]]
## [1] 161
##
## [[5]]
## [1] 158
##
## [[6]]
## [1] 177
# Gender bar charts, one per subsample, with each bar's count printed at its
# top.
#
# The gender column is mapped by name inside aes() so that ggplot2 evaluates
# it against the data frame passed to ggplot(), rather than against `df$col`
# expressions and free-standing global vectors. The legend title is set
# explicitly so every plot shows "Gender" instead of the name of whichever
# helper vector was mapped to `fill`.
gender_bar_layers <- list(
  geom_bar(),
  # stat = "count" recomputes the per-bar counts so they can be used as labels
  geom_text(stat = "count", aes(label = after_stat(count))),
  labs(
    title = "Gender Distribution",
    x = "Gender",
    y = "Frequency",
    fill = "Gender"
  )
)

ggplot(df_1, aes(x = df_1.Gender, fill = df_1.Gender)) + gender_bar_layers
ggplot(df_2, aes(x = df_2.Gender, fill = df_2.Gender)) + gender_bar_layers
ggplot(df_3, aes(x = df_3.Gender, fill = df_3.Gender)) + gender_bar_layers
ggplot(df_4, aes(x = df_4.Gender, fill = df_4.Gender)) + gender_bar_layers
ggplot(df_5, aes(x = df_5.Gender, fill = df_5.Gender)) + gender_bar_layers
ggplot(df_6, aes(x = df_6.Gender, fill = df_6.Gender)) + gender_bar_layers
# Number of candidates whose education is recorded as 'Illiterate' in each
# subsample, drawn as one bar per subsample.
#
# The counts are computed directly from each sample's education column; no
# grouping is needed because a single total per sample is wanted. Missing
# education values are ignored rather than turning the whole count into NA.
education_by_sample <- list(
  df_1$df_1.Education,
  df_2$df_2.Education,
  df_3$df_3.Education,
  df_4$df_4.Education,
  df_5$df_5.Education,
  df_6$df_6.Education
)

# One row per subsample: `sample_` is the sample number (1-6, as a factor so
# it is treated as discrete on the x axis), `count_` the illiterate count.
count_ill <- data.frame(
  sample_ = factor(seq_along(education_by_sample)),
  count_ = vapply(
    education_by_sample,
    function(edu) sum(edu == 'Illiterate', na.rm = TRUE),
    integer(1)
  )
)

# geom_col() draws bar heights from `count_` itself; geom_bar() would instead
# count how many rows share each x value.
ggplot(data = count_ill, aes(x = sample_, y = count_)) +
  geom_col(fill = "blue") +
  labs(title = "Candidates with having no formal education",
       x = "Sample number",
       y = "Count") +
  theme_minimal()