# Load required libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
# Directory that holds the input CSV.
# The path is built explicitly with file.path() instead of calling setwd(),
# so running this script no longer changes the session's working directory.
# The previous rm(list = ls()) call is gone as well: it wiped every object in
# the caller's global environment without resetting loaded packages or options.
# NOTE(review): if code elsewhere in this script uses relative paths, build
# them from data_dir too.
data_dir <- "~/Desktop/Indian State"

# Read the data
data <- read_csv(file.path(data_dir, "Constituent Assembly Members.csv"))
## New names:
## • `Party` -> `Party...6`
## • `Party` -> `Party...14`
## Rows: 323 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Name, Year of Death, Community, Religion, Caste, Party...6, Active...
## dbl (2): Years active, yrs as MP /MLA
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Tidy the raw table before summarising:
#   - strip leading/trailing whitespace from the three Yes/No text columns
#   - coerce the two year-count columns to numeric
# across() rewrites each column in place, so column order is unchanged.
clean_data <- mutate(
  data,
  across(c(Active, Positions, Dynasty), trimws),
  across(c(`Years active`, `yrs as MP /MLA`), as.numeric)
)
# 1. Active (categorical): rows per distinct value, plus each value's share
#    of all rows. Missing values form their own row and count toward the total.
active_counts <- count(clean_data, Active)
active_summary <- mutate(active_counts, Percentage = n / sum(n) * 100)
rm(active_counts)
# Summarise one numeric column of a data frame.
#
#   df  - data frame (or tibble) holding the column
#   col - unquoted column name; forwarded into summarise() with {{ }}
#
# Returns the result of summarise() with five columns:
#   Count  - number of non-missing values
#   Mean, Median, Min, Max - computed with missing values removed
# For an ungrouped input this is a single row.
summarise_numeric <- function(df, col) {
  df %>%
    summarise(
      Count = sum(!is.na({{ col }})),
      Mean = mean({{ col }}, na.rm = TRUE),
      Median = median({{ col }}, na.rm = TRUE),
      Min = min({{ col }}, na.rm = TRUE),
      Max = max({{ col }}, na.rm = TRUE)
    )
}

# 2. Years active (Numerical) - only rows where Active is "Yes"
years_active_summary <- clean_data %>%
  filter(Active == "Yes") %>%
  summarise_numeric(`Years active`)

# 3. Yrs as MP/MLA (Numerical) - all rows
mp_mla_summary <- clean_data %>%
  summarise_numeric(`yrs as MP /MLA`)
# 4. Positions (categorical): rows per distinct value and the share each
#    value takes of all rows (missing values are tallied as their own row).
positions_summary <- mutate(
  count(clean_data, Positions),
  Percentage = n / sum(n) * 100
)
# 5. Dynasty (categorical): rows per distinct value and the share each
#    value takes of all rows (missing values are tallied as their own row).
dynasty_summary <- mutate(
  count(clean_data, Dynasty),
  Percentage = n / sum(n) * 100
)
# Print all summaries
# Each summary is printed as a heading string followed by the summary tibble.
# Lines starting with `##` are recorded console output, not code.

# Active: counts and percentages per value. Percentage is taken over all
# rows, so the <NA> row is part of the denominator.
print("Active Summary:")
## [1] "Active Summary:"
print(active_summary)
## # A tibble: 3 × 3
## Active n Percentage
## <chr> <int> <dbl>
## 1 No 82 25.4
## 2 Yes 223 69.0
## 3 <NA> 18 5.57
# Years active: computed only on rows where Active == "Yes"; Count is the
# number of non-missing values among those rows.
print("Years Active Summary:")
## [1] "Years Active Summary:"
print(years_active_summary)
## # A tibble: 1 × 5
## Count Mean Median Min Max
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 222 14.9 12.5 0 50
# Years as MP/MLA: computed on all rows; Count is the number of non-missing
# values.
print("MP/MLA Years Summary:")
## [1] "MP/MLA Years Summary:"
print(mp_mla_summary)
## # A tibble: 1 × 5
## Count Mean Median Min Max
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 304 7.65 5 0 44
# Positions: counts and percentages per value, <NA> included in the total.
print("Positions Summary:")
## [1] "Positions Summary:"
print(positions_summary)
## # A tibble: 3 × 3
## Positions n Percentage
## <chr> <int> <dbl>
## 1 No 154 47.7
## 2 Yes 150 46.4
## 3 <NA> 19 5.88
# Dynasty: counts and percentages per value, <NA> included in the total.
print("Dynasty Summary:")
## [1] "Dynasty Summary:"
print(dynasty_summary)
## # A tibble: 3 × 3
## Dynasty n Percentage
## <chr> <int> <dbl>
## 1 No 253 78.3
## 2 Yes 49 15.2
## 3 <NA> 21 6.50
# Create visualizations
library(ggplot2)
# Bar chart: number of rows for each value of Active
clean_data %>%
  ggplot(aes(x = Active)) +
  labs(title = "Distribution of Active Members") +
  geom_bar()

# Histogram for Years Active (rows with Active == "Yes" only).
# boundary = 0 puts the bin edges at 0, 5, 10, ... years. Without it,
# 5-wide bins are centred on multiples of 5, so the first bin would cover
# -2.5 to 2.5 even though the variable is a non-negative year count.
# Rows with a missing `Years active` are still dropped by stat_bin() with a
# warning.
clean_data %>%
  filter(Active == "Yes") %>%
  ggplot(aes(x = `Years active`)) +
  geom_histogram(binwidth = 5, boundary = 0) +
  labs(title = "Distribution of Years Active")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# Histogram for Years as MP/MLA (all rows).
# boundary = 0 puts the bin edges at 0, 5, 10, ... years. Without it,
# 5-wide bins are centred on multiples of 5, so the first bin would cover
# -2.5 to 2.5 even though the variable is a non-negative year count.
# Rows with a missing `yrs as MP /MLA` are still dropped by stat_bin() with a
# warning.
ggplot(clean_data, aes(x = `yrs as MP /MLA`)) +
  geom_histogram(binwidth = 5, boundary = 0) +
  labs(title = "Distribution of Years as MP/MLA")
## Warning: Removed 19 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Bar chart: number of rows for each value of Positions
clean_data %>%
  ggplot(aes(x = Positions)) +
  labs(title = "Distribution of Positions Held") +
  geom_bar()

# Bar chart: number of rows for each value of Dynasty
clean_data %>%
  ggplot(aes(x = Dynasty)) +
  labs(title = "Distribution of Political Dynasties") +
  geom_bar()

# Rebuild clean_data from the raw table for the Positions x Dynasty analysis.
# Positions and Dynasty are trimmed and turned into factors with the fixed
# level order Yes, No; any other value becomes NA during that conversion.
# Rows where either factor is NA are then dropped, so only Yes/No pairs remain.
clean_data <- data %>%
  mutate(
    Active = trimws(Active),
    across(
      c(Positions, Dynasty),
      function(col) factor(trimws(col), levels = c("Yes", "No"))
    )
  ) %>%
  filter(!is.na(Positions), !is.na(Dynasty))
# Cross-tabulate Positions (rows) against Dynasty (columns)
contingency_table <- table(
  clean_data[["Positions"]],
  clean_data[["Dynasty"]]
)
print("Contingency Table:")
## [1] "Contingency Table:"
print(contingency_table)
##
## Yes No
## Yes 27 119
## No 22 132
# Row-wise percentages: each Positions row is scaled to sum to 100
prop_table <- 100 * prop.table(contingency_table, 1)
print("Proportions Table (by Position):")
## [1] "Proportions Table (by Position):"
print(prop_table)
##
## Yes No
## Yes 18.49315 81.50685
## No 14.28571 85.71429
# Pearson chi-squared test of independence between Positions and Dynasty.
# correct = TRUE is chisq.test()'s default (continuity correction for 2x2
# tables); it is spelled out here only for readability.
chi_test <- chisq.test(x = contingency_table, correct = TRUE)
print("Chi-Squared Test Results:")
## [1] "Chi-Squared Test Results:"
print(chi_test)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: contingency_table
## X-squared = 0.68739, df = 1, p-value = 0.4071
# Odds ratio of the 2x2 table: product of the diagonal cells divided by the
# product of the off-diagonal cells. Cells are addressed by level name;
# "Yes" is row/column 1 and "No" is row/column 2 of contingency_table.
odds_ratio <-
  (contingency_table["Yes", "Yes"] * contingency_table["No", "No"]) /
  (contingency_table["Yes", "No"] * contingency_table["No", "Yes"])
print(paste0("Odds Ratio: ", round(odds_ratio, 2)))
## [1] "Odds Ratio: 1.36"
# Stacked bars scaled to 100%: for each Positions value, the share of rows
# falling under each Dynasty value, with the y axis labelled as percentages.
clean_data %>%
  ggplot(aes(x = Positions, fill = Dynasty)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal() +
  labs(
    fill = "Political Dynasty",
    y = "Percentage",
    x = "Held Position",
    title = "Relationship Between Holding Positions and Political Dynasty"
  )

# Same cross-tabulation as raw counts: side-by-side bars, one per Dynasty
# value, within each Positions group.
clean_data %>%
  ggplot(aes(x = Positions, fill = Dynasty)) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(
    fill = "Political Dynasty",
    y = "Count",
    x = "Held Position",
    title = "Count of Positions by Dynasty Status"
  )
