# Load required libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)

# Directory that holds the input data. The path is built explicitly instead of
# calling setwd(), and the workspace is no longer wiped with rm(list = ls()),
# so running this script does not alter the caller's working directory or
# delete objects from their session.
data_dir <- path.expand("~/Desktop/Indian State")

# Read the data
data <- read_csv(file.path(data_dir, "Constituent Assembly Members.csv"))
## New names:
## • `Party` -> `Party...6`
## • `Party` -> `Party...14`
## Rows: 323 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Name, Year of Death, Community, Religion, Caste, Party...6, Active...
## dbl  (2): Years active, yrs as MP /MLA
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Tidy the raw table before summarising:
#   * strip leading/trailing whitespace from the Active, Positions and Dynasty
#     text columns
#   * coerce the two year-count columns to numeric
clean_data <- data %>%
  mutate(
    across(c(Active, Positions, Dynasty), trimws),
    across(c(`Years active`, `yrs as MP /MLA`), as.numeric)
  )

# 1. Active (Categorical: Yes/No)
# Tally every distinct value of Active (missing values get their own row) and
# express each tally as a percentage of all rows.
active_summary <- mutate(
  count(clean_data, Active),
  Percentage = prop.table(n) * 100
)

# 2. Years active (Numerical)
# Restricted to rows where Active is "Yes"; every statistic ignores NAs.
# `.names = "{.fn}"` names each output column after its summary function.
years_active_summary <- clean_data %>%
  filter(Active == "Yes") %>%
  summarise(
    across(
      `Years active`,
      list(
        Count = function(x) sum(!is.na(x)),
        Mean = function(x) mean(x, na.rm = TRUE),
        Median = function(x) median(x, na.rm = TRUE),
        Min = function(x) min(x, na.rm = TRUE),
        Max = function(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}"
    )
  )

# 3. Yrs as MP/MLA (Numerical)
# Computed over all rows; every statistic ignores NAs.
# `.names = "{.fn}"` names each output column after its summary function.
mp_mla_summary <- clean_data %>%
  summarise(
    across(
      `yrs as MP /MLA`,
      list(
        Count = function(x) sum(!is.na(x)),
        Mean = function(x) mean(x, na.rm = TRUE),
        Median = function(x) median(x, na.rm = TRUE),
        Min = function(x) min(x, na.rm = TRUE),
        Max = function(x) max(x, na.rm = TRUE)
      ),
      .names = "{.fn}"
    )
  )

# 4. Positions (Categorical: Yes/No)
# Tally every distinct value of Positions (missing values get their own row)
# and express each tally as a percentage of all rows.
positions_summary <- mutate(
  count(clean_data, Positions),
  Percentage = prop.table(n) * 100
)

# 5. Dynasty (Categorical: Yes/No)
# Tally every distinct value of Dynasty (missing values get their own row) and
# express each tally as a percentage of all rows.
dynasty_summary <- mutate(
  count(clean_data, Dynasty),
  Percentage = prop.table(n) * 100
)

# Print all summaries
# Each summary is preceded by a label printed with print(), so the label is
# shown as a quoted string with a "[1]" index prefix. The "##" lines are the
# output recorded when this script was rendered.
print("Active Summary:")
## [1] "Active Summary:"
print(active_summary)
## # A tibble: 3 × 3
##   Active     n Percentage
##   <chr>  <int>      <dbl>
## 1 No        82      25.4 
## 2 Yes      223      69.0 
## 3 <NA>      18       5.57
print("Years Active Summary:")
## [1] "Years Active Summary:"
print(years_active_summary)
## # A tibble: 1 × 5
##   Count  Mean Median   Min   Max
##   <int> <dbl>  <dbl> <dbl> <dbl>
## 1   222  14.9   12.5     0    50
print("MP/MLA Years Summary:")
## [1] "MP/MLA Years Summary:"
print(mp_mla_summary)
## # A tibble: 1 × 5
##   Count  Mean Median   Min   Max
##   <int> <dbl>  <dbl> <dbl> <dbl>
## 1   304  7.65      5     0    44
print("Positions Summary:")
## [1] "Positions Summary:"
print(positions_summary)
## # A tibble: 3 × 3
##   Positions     n Percentage
##   <chr>     <int>      <dbl>
## 1 No          154      47.7 
## 2 Yes         150      46.4 
## 3 <NA>         19       5.88
print("Dynasty Summary:")
## [1] "Dynasty Summary:"
print(dynasty_summary)
## # A tibble: 3 × 3
##   Dynasty     n Percentage
##   <chr>   <int>      <dbl>
## 1 No        253      78.3 
## 2 Yes        49      15.2 
## 3 <NA>       21       6.50
# Create visualizations
library(ggplot2)

# Bar plot for Active
# One bar per distinct value of Active; bar height is the number of rows.
clean_data %>%
  ggplot(mapping = aes(x = Active)) +
  geom_bar() +
  ggtitle("Distribution of Active Members")

# Histogram for Years Active
# Only rows with Active == "Yes" are plotted, in bins 5 units wide.
clean_data %>%
  filter(Active == "Yes") %>%
  ggplot(mapping = aes(x = `Years active`)) +
  geom_histogram(binwidth = 5) +
  ggtitle("Distribution of Years Active")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# Histogram for Years as MP/MLA
# All rows are plotted, in bins 5 units wide.
clean_data %>%
  ggplot(mapping = aes(x = `yrs as MP /MLA`)) +
  geom_histogram(binwidth = 5) +
  ggtitle("Distribution of Years as MP/MLA")
## Warning: Removed 19 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Bar plot for Positions
# One bar per distinct value of Positions; bar height is the number of rows.
clean_data %>%
  ggplot(mapping = aes(x = Positions)) +
  geom_bar() +
  ggtitle("Distribution of Positions Held")

# Bar plot for Dynasty
# One bar per distinct value of Dynasty; bar height is the number of rows.
clean_data %>%
  ggplot(mapping = aes(x = Dynasty)) +
  geom_bar() +
  ggtitle("Distribution of Political Dynasties")

# Rebuild clean_data from the raw table for the Positions-vs-Dynasty
# comparison. This overwrites the earlier clean_data.
clean_data <- data %>%
  mutate(
    Active = trimws(Active),
    # After trimming, any value other than "Yes"/"No" becomes NA because it
    # is not among the declared factor levels
    Positions = factor(trimws(Positions), levels = c("Yes", "No")),
    Dynasty = factor(trimws(Dynasty), levels = c("Yes", "No"))
  ) %>%
  # Dropping NAs therefore keeps exactly the rows where both are "Yes"/"No"
  filter(!is.na(Positions), !is.na(Dynasty))

# Create a contingency table
# Rows are Positions and columns are Dynasty, each ordered "Yes" then "No"
contingency_table <- table(clean_data[["Positions"]], clean_data[["Dynasty"]])
print("Contingency Table:")
## [1] "Contingency Table:"
print(contingency_table)
##      
##       Yes  No
##   Yes  27 119
##   No   22 132
# Calculate proportions
# margin = 1 normalises within each row, so every Positions row sums to 100
prop_table <- 100 * prop.table(contingency_table, margin = 1)
print("Proportions Table (by Position):")
## [1] "Proportions Table (by Position):"
print(prop_table)
##      
##            Yes       No
##   Yes 18.49315 81.50685
##   No  14.28571 85.71429
# Perform chi-squared test for independence
# correct = TRUE is the default; it is spelled out because the 2 x 2 table
# makes chisq.test() apply Yates' continuity correction
chi_test <- chisq.test(x = contingency_table, correct = TRUE)
print("Chi-Squared Test Results:")
## [1] "Chi-Squared Test Results:"
print(chi_test)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency_table
## X-squared = 0.68739, df = 1, p-value = 0.4071
# Calculate odds ratio
# Cross-product ratio of the 2 x 2 table: (Yes,Yes * No,No) / (Yes,No * No,Yes)
odds_ratio <-
  (contingency_table["Yes", "Yes"] * contingency_table["No", "No"]) /
  (contingency_table["Yes", "No"] * contingency_table["No", "Yes"])
print(paste("Odds Ratio:", round(odds_ratio, digits = 2)))
## [1] "Odds Ratio: 1.36"
# Create a visualization
# position = "fill" stacks the Dynasty counts within each Positions bar and
# rescales every bar to the same height; the y axis is labelled in percent.
clean_data %>%
  ggplot(mapping = aes(x = Positions, fill = Dynasty)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal() +
  labs(
    fill = "Political Dynasty",
    y = "Percentage",
    x = "Held Position",
    title = "Relationship Between Holding Positions and Political Dynasty"
  )

# Alternative visualization: grouped bar chart
# position = "dodge" places the Dynasty bars side by side within each
# Positions group; bar height is the raw row count.
clean_data %>%
  ggplot(mapping = aes(x = Positions, fill = Dynasty)) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(
    fill = "Political Dynasty",
    y = "Count",
    x = "Held Position",
    title = "Count of Positions by Dynasty Status"
  )