Loksabha_data_dive

Creating different samples within our data

# Number of random sub-samples to draw from `data`.
sub_samp <- 6

# Rows per sub-sample: half of `data`, rounded down. A fractional size would
# only be truncated silently by the sampler, so the integer is made explicit.
sub_samp_size <- nrow(data) %/% 2

# Sub-samples are stored under the names "df_1", "df_2", ...
sub_samp_list <- list()

# NOTE(review): no set.seed() call is visible in this section, so each run
# draws different rows -- seed the RNG beforehand if the results must be
# reproducible.
for (i in seq_len(sub_samp)) {
  # Row positions drawn without replacement; sample.int() works directly on
  # the row count instead of materialising 1:nrow(data).
  subsample_indices <- sample.int(nrow(data), sub_samp_size)
  subsample <- data[subsample_indices, ]

  sub_samp_list[[paste0("df_", i)]] <- subsample
}

We create 6 different samples, each containing 50% of the rows from our main data.

# Confirm how many sub-samples were stored.
length(sub_samp_list)

## [1] 6

# Inspect the second sub-sample. `[[` extracts the data frame itself, whereas
# `[` would hand the viewer a one-element list wrapping it.
view(sub_samp_list[[2]])

Converting our samples to individual dataframes.

# Turn every stored sub-sample into its own data frame.
# data.frame() receives a one-element *named* list (single-bracket subset), so
# each column name is prefixed with the element name, e.g. "df_1.Age". The
# steps that follow rely on those prefixed names.
df_1 <- data.frame(sub_samp_list["df_1"])
df_2 <- data.frame(sub_samp_list["df_2"])
df_3 <- data.frame(sub_samp_list["df_3"])
df_4 <- data.frame(sub_samp_list["df_4"])
df_5 <- data.frame(sub_samp_list["df_5"])
df_6 <- data.frame(sub_samp_list["df_6"])

Selecting required columns.

# Columns kept for the analysis, written without the per-sample prefix.
required_cols <- c(
  "Candidate", "Party", "Criminal.Cases", "Education", "Age",
  "Constituency", "Total.Assets", "Winner", "Gender"
)

# Each data frame carries its own prefix ("df_1.", "df_2.", ...), so the
# prefix is pasted onto the shared column list before subsetting.
df_1 <- df_1[, paste0("df_1.", required_cols)]
df_2 <- df_2[, paste0("df_2.", required_cols)]
df_3 <- df_3[, paste0("df_3.", required_cols)]
df_4 <- df_4[, paste0("df_4.", required_cols)]
df_5 <- df_5[, paste0("df_5.", required_cols)]
df_6 <- df_6[, paste0("df_6.", required_cols)]

Observing the Age distribution in each of our samples.

# Histogram layer and axis labels shared by all six age plots:
# 5-year bins, blue bars, identical title and axis text.
age_hist_layer <- geom_histogram(binwidth = 5, fill = "blue")
age_hist_labels <- labs(title = "Age Distribution", x = "Age", y = "Frequency")

# One histogram per sub-sample; only the data frame and its prefixed Age
# column differ between the plots.
ggplot(data = df_1, mapping = aes(x = df_1.Age)) + age_hist_layer + age_hist_labels

ggplot(data = df_2, mapping = aes(x = df_2.Age)) + age_hist_layer + age_hist_labels

ggplot(data = df_3, mapping = aes(x = df_3.Age)) + age_hist_layer + age_hist_labels

ggplot(data = df_4, mapping = aes(x = df_4.Age)) + age_hist_layer + age_hist_labels

ggplot(data = df_5, mapping = aes(x = df_5.Age)) + age_hist_layer + age_hist_labels

ggplot(data = df_6, mapping = aes(x = df_6.Age)) + age_hist_layer + age_hist_labels

Finding the count of winners in each of our samples.

# Number of rows flagged as winners (Winner == 1) in each sub-sample.
m1 <- sum(df_1$df_1.Winner == 1)
m2 <- sum(df_2$df_2.Winner == 1)
m3 <- sum(df_3$df_3.Winner == 1)
m4 <- sum(df_4$df_4.Winner == 1)
m5 <- sum(df_5$df_5.Winner == 1)
m6 <- sum(df_6$df_6.Winner == 1)

# Collected as an unnamed list, one element per sub-sample in order.
Winner_count <- list(m1, m2, m3, m4, m5, m6)

print(Winner_count)

## [[1]]
## [1] 138
## 
## [[2]]
## [1] 149
## 
## [[3]]
## [1] 165
## 
## [[4]]
## [1] 161
## 
## [[5]]
## [1] 158
## 
## [[6]]
## [1] 177

Gender distribution in our samples

# Bar chart of the number of candidates per gender for every sub-sample, with
# the count written on each bar.
#
# aes() names the column directly instead of using `df$col` or a vector held
# in the global environment: ggplot2 discourages `$` inside aes(), and a
# mapping that bypasses the plot's data is tied to whatever the global vector
# holds when the plot is drawn. The legend title is set explicitly so all six
# plots read "Gender" rather than the name of a helper variable.
#
# The Gender* vectors are still created in case later sections use them; the
# plots themselves no longer depend on them.
Gender <- df_1$df_1.Gender
ggplot(df_1, aes(x = df_1.Gender, fill = df_1.Gender)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count))) +
  labs(title = "Gender Distribution", x = "Gender", y = "Frequency",
       fill = "Gender")

Gender2 <- df_2$df_2.Gender
ggplot(df_2, aes(x = df_2.Gender, fill = df_2.Gender)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count))) +
  labs(title = "Gender Distribution", x = "Gender", y = "Frequency",
       fill = "Gender")

Gender3 <- df_3$df_3.Gender
ggplot(df_3, aes(x = df_3.Gender, fill = df_3.Gender)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count))) +
  labs(title = "Gender Distribution", x = "Gender", y = "Frequency",
       fill = "Gender")

Gender4 <- df_4$df_4.Gender
ggplot(df_4, aes(x = df_4.Gender, fill = df_4.Gender)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count))) +
  labs(title = "Gender Distribution", x = "Gender", y = "Frequency",
       fill = "Gender")

Gender5 <- df_5$df_5.Gender
ggplot(df_5, aes(x = df_5.Gender, fill = df_5.Gender)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count))) +
  labs(title = "Gender Distribution", x = "Gender", y = "Frequency",
       fill = "Gender")

Gender6 <- df_6$df_6.Gender
ggplot(df_6, aes(x = df_6.Gender, fill = df_6.Gender)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count))) +
  labs(title = "Gender Distribution", x = "Gender", y = "Frequency",
       fill = "Gender")

Finding candidates with no education.

# Count the candidates whose Education is "Illiterate" in each sub-sample.
#
# The previous version grouped by `df_k$Education`, a column that does not
# exist (the columns are named `df_k.Education`), and then summarised with
# `df_k$...`, which ignores any grouping. The grouping step is dropped and the
# column is referenced directly, leaving a one-row, one-column result per
# sample. na.rm = TRUE keeps a missing Education value from turning the whole
# count into NA.
ill1 <- df_1 |>
  summarise(noOfCandidates = sum(df_1.Education == "Illiterate", na.rm = TRUE))

ill2 <- df_2 |>
  summarise(noOfCandidates = sum(df_2.Education == "Illiterate", na.rm = TRUE))

ill3 <- df_3 |>
  summarise(noOfCandidates = sum(df_3.Education == "Illiterate", na.rm = TRUE))

ill4 <- df_4 |>
  summarise(noOfCandidates = sum(df_4.Education == "Illiterate", na.rm = TRUE))

ill5 <- df_5 |>
  summarise(noOfCandidates = sum(df_5.Education == "Illiterate", na.rm = TRUE))

ill6 <- df_6 |>
  summarise(noOfCandidates = sum(df_6.Education == "Illiterate", na.rm = TRUE))


# Per-sample results in sample order (1 to 6).
edu_samples <- list(ill1, ill2, ill3, ill4, ill5, ill6)

# One row per sample; `count_` holds that sample's illiterate-candidate count.
count_ill <- data.frame(count_ = unlist(edu_samples, use.names = FALSE))

# Bar per sample showing how many candidates have no formal education.
# The row position in `count_ill` is the sample number (x) and `count_` is the
# bar height (y). The earlier version put the count values on the x axis and
# let geom_bar() tally rows, which did not match the axis labels.
ggplot(data = count_ill, aes(x = factor(seq_along(count_)), y = count_)) +
  geom_col(fill = "blue") +
  labs(title = "Candidates with having no formal education",
       x = "Sample number",
       y = "Count") +
  theme_minimal()

Loksabha_data_dive_3

2023-09-17

Dataset

Loading our Dataset

Creating different samples within our data

We create 6 different samples, each containing 50% of the rows from our main data.

Converting our samples to individual dataframes.

Selecting required columns.

Observing the Age distribution in each of our samples.

Finding the count of winners in each of our samples.

Gender distribution in our samples

Finding candidates with no education.