EDA

MFC <- df
# Convert columns to numeric if they are not already
MFC$totalsave <- as.numeric(as.character(MFC$totalsave))
MFC$shareout <- as.numeric(as.character(MFC$shareout))

## Warning: NAs introduced by coercion

# Create the new column 'profit' using the calculation 'shareout' minus 'totalsave'
MFC$profit <- MFC$shareout - MFC$totalsave

MFC <- MFC[!is.na(MFC$profit), ]

# Create a new dataframe with minimum and maximum currtotalsave values for each uniqueid
new_df <- df %>%
  group_by(uniqueid) %>%
  summarize(min_totalsave = min(totalsave, na.rm = TRUE),
            max_shareout = max(shareout, na.rm = TRUE))

## Warning: There were 6980 warnings in `summarize()`.
## The first warning was:
## ℹ In argument: `min_totalsave = min(totalsave, na.rm = TRUE)`.
## ℹ In group 2: `uniqueid = "Ind-000050"`.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 6979 remaining warnings.

# Filter out rows with "inf" values
new_df <- new_df %>%
  filter_all(all_vars(!is.infinite(.)))

# Convert columns to numeric if they are not already
new_df$max_shareout <- as.numeric(as.character(new_df$max_shareout))

## Warning: NAs introduced by coercion

new_df$min_totalsave <- as.numeric(as.character(new_df$min_totalsave))


# Calculate percent increase and add it as a new column
new_df$finalreturn <- ((new_df$max_shareout - new_df$min_totalsave) / new_df$min_totalsave) * 100

# Print the updated dataframe
print(new_df)

## # A tibble: 11,413 × 4
##    uniqueid   min_totalsave max_shareout finalreturn
##    <chr>              <dbl>        <dbl>       <dbl>
##  1 Ind-000032        150000       175400        16.9
##  2 Ind-000207        170000       420600       147. 
##  3 Ind-000208        180000       392950       118. 
##  4 Ind-000211        180000       392950       118. 
##  5 Ind-000212        145000       360500       149. 
##  6 Ind-000218         66000       313900       376. 
##  7 Ind-000219        173000       265150        53.3
##  8 Ind-000220        150000            0      -100  
##  9 Ind-000222        105000       208200        98.3
## 10 Ind-000225        126000       226850        80.0
## # ℹ 11,403 more rows

# Filter out rows with "inf" values
new_df <- new_df %>%
  filter_all(all_vars(!is.infinite(.)))

# Filter out rows with non-finite values in percent_increase
MFC_filtered <- new_df[is.finite(new_df$finalreturn), ]

# Determine suitable binwidth for the histogram
binwidth <- diff(range(MFC_filtered$finalreturn)) / 30  # Adjust the number of bins as needed

# Create histogram of percent_increase
ggplot(MFC_filtered, aes(x = finalreturn)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
  labs(x = "finalreturn", y = "Frequency") +
  ggtitle("Histogram of finalreturn") +
  xlim(0,200) +
  ylim(0,200)

## Warning: Removed 1405 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 16 rows containing missing values or values outside the scale range
## (`geom_bar()`).

MFC_filtered %>% 
  arrange(finalreturn)

## # A tibble: 11,362 × 4
##    uniqueid   min_totalsave max_shareout finalreturn
##    <chr>              <dbl>        <dbl>       <dbl>
##  1 Ind-000220        150000            0        -100
##  2 Ind-000322        100000            0        -100
##  3 Ind-000334        100000            0        -100
##  4 Ind-000416        100000            0        -100
##  5 Ind-000434        100000            0        -100
##  6 Ind-001240        150000            0        -100
##  7 Ind-002907        106000            0        -100
##  8 Ind-003871        220000            0        -100
##  9 Ind-004015        100000            0        -100
## 10 Ind-004019         80000            0        -100
## # ℹ 11,352 more rows

# Create new column based on finalreturn
MFC_filtered$positive_negative <- ifelse(MFC_filtered$finalreturn > 0, "positive", 
                                 ifelse(MFC_filtered$finalreturn < 0, "negative", "zero"))

table(MFC_filtered$positive_negative)

## 
## negative positive     zero 
##      204    11143       15

# Calculate the 1st and 99th percentiles of finalreturn
bottom_1_threshold <- quantile(MFC_filtered$finalreturn, 0.01)
top_1_threshold <- quantile(MFC_filtered$finalreturn, 0.99)

# Create new column "1_percent"
MFC_filtered$one_percent <- ifelse(MFC_filtered$finalreturn < bottom_1_threshold, "bottom",
                           ifelse(MFC_filtered$finalreturn > top_1_threshold, "top", "middle"))

# Calculate the 5th and 95th percentiles of finalreturn
bottom_5_threshold <- quantile(MFC_filtered$finalreturn, 0.05)
top_5_threshold <- quantile(MFC_filtered$finalreturn, 0.95)

# Create new column "5_percent"
MFC_filtered$five_percent <- ifelse(MFC_filtered$finalreturn < bottom_5_threshold, "bottom",
                           ifelse(MFC_filtered$finalreturn > top_5_threshold, "top", "middle"))

# Calculate the 5th and 95th percentiles of finalreturn
bottom_10_threshold <- quantile(MFC_filtered$finalreturn, 0.1)
top_10_threshold <- quantile(MFC_filtered$finalreturn, 0.9)

# Create new column "5_percent"
MFC_filtered$ten_percent <- ifelse(MFC_filtered$finalreturn < bottom_10_threshold, "bottom",
                           ifelse(MFC_filtered$finalreturn > top_10_threshold, "top", "middle"))


# Load the openxlsx package
library(openxlsx)

# Write the dataframe to an Excel file
write.xlsx(MFC_filtered, "individual_return.xlsx")

merged_df <- left_join(MFC_filtered, df, by = "uniqueid")

data <- merged_df

# Group data by ten_percent and approxedulvl, calculate counts
grouped_data <- data %>%
  group_by(ten_percent, ownslandnumeric) %>% #CATEGORY
  summarise(count = n()) %>%
  ungroup()

## `summarise()` has grouped output by 'ten_percent'. You can override using the
## `.groups` argument.

# Calculate percentages within each ten_percent group
grouped_data <- grouped_data %>%
  group_by(ten_percent) %>%
  mutate(percent = count / sum(count) * 100) %>%
  ungroup()

# Plot pie charts for each ten_percent group
pie_charts <- grouped_data %>%
  group_by(ten_percent) %>%
  do(plot_data = ggplot(., aes(x = "", y = percent, fill = ownslandnumeric)) + #CATEGORY
       geom_bar(stat = "identity", width = 1, color = "white") +
       coord_polar("y", start = 0) +
       ggtitle(paste("Pie Chart for Ten Percent Group:", unique(.$ten_percent))) +
       theme_void() +
       theme(legend.position = "bottom") +
       labs(fill = "ownslandmueric")) #CATEGORY

# Combine pie charts into a grid layout

library(patchwork)

wrap_plots(pie_charts$plot_data, nrow = length(unique(grouped_data$ten_percent)))

adjumani incomesteadinessnumeric urbancenterdistance cycle1graduationprofit monthlychurch cycle2graduationprofit age toiletnum cycle3graduationprofit noofcattleorbuffalo noofadultsheepgoatsorpigs depsnoregschool noofotheranimals sizeoffirstloan cycle4graduationprofit

adjumani

# Load necessary libraries
library(ggplot2)

# Calculate averages by incomesteadiness category
median_finalreturn <- aggregate(finalreturn ~ adjumani, data = merged_df, FUN = median)

# Create barplot
ggplot(median_finalreturn, aes(x = adjumani, y = finalreturn)) +
  geom_bar(stat = "identity") +
  labs(x = "Income Steadiness Category", y = "Median Final Return") +
  ggtitle("Median Final Return by Adjumani Category")

# Load necessary library
library(dplyr)

# Create a new column based on conditions
adjumani_gender <- merged_df %>%
  mutate(group = case_when(
    adjumani == 1 & gender == "Male" ~ "adjumani, male",
    adjumani == 1 & gender == "Female" ~ "adjumani, female",
    adjumani == 0 & gender == "Male" ~ "not adjumani, male",
    adjumani == 0 & gender == "Female" ~ "not adjumani, female",
    TRUE ~ NA_character_  # Default value if none of the conditions are met
  ))



# 4 boxplots
ggplot(adjumani_gender, aes(x = group, y = finalreturn)) +
  geom_boxplot(coef = 3) +  # Adjust this coefficient as needed
  labs(x = "Income Steadiness Category", y = "Final Return") +
  ggtitle("Boxplot of Final Return by whether the individual is in Adjumani and Female") +
  ylim(0,200)

## Warning: Removed 4577 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Regular 2 boxplots
ggplot(adjumani_gender, aes(x = gender, y = finalreturn)) +
  geom_boxplot(coef = 3) +  # Adjust this coefficient as needed
  labs(x = "Income Steadiness Category", y = "Final Return") +
  ggtitle("Boxplot of Final Return by whether the individual is Female") +
  ylim(0,200)

## Warning: Removed 4577 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Incomesteadiness

# Load necessary libraries
library(ggplot2)

# Calculate averages by incomesteadiness category
median_finalreturn <- aggregate(finalreturn ~ incomesteadiness, data = merged_df, FUN = median)

# Create barplot
ggplot(median_finalreturn, aes(x = incomesteadiness, y = finalreturn)) +
  geom_bar(stat = "identity") +
  labs(x = "Income Steadiness Category", y = "Median Final Return") +
  ggtitle("Median Final Return by Income Steadiness Category")

# Create boxplot with limited outliers
ggplot(merged_df, aes(x = incomesteadiness, y = finalreturn)) +
  geom_boxplot(coef = 3) +  # Adjust this coefficient as needed
  labs(x = "Income Steadiness Category", y = "Final Return") +
  ggtitle("Boxplot of Final Return by Income Steadiness Category") +
  ylim(0,100)

## Warning: Removed 10000 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

urbancenterdistance

# Load necessary libraries
library(ggplot2)

merged_df$urbancenterdistance <- as.numeric(as.character(merged_df$urbancenterdistance))
# Load necessary libraries

# Filter out missing or non-numeric values in urbancenterdistance
merged_df <- merged_df %>%
  filter(!is.na(urbancenterdistance) & is.numeric(urbancenterdistance))

# Bin urbancenterdistance into groups of ten
merged_df <- merged_df %>%
  mutate(distance_bins = cut(urbancenterdistance, breaks = seq(0, max(urbancenterdistance), by = 50), include.lowest = TRUE))

# Calculate median finalreturn for each distance bin
median_data <- merged_df %>%
  group_by(distance_bins) %>%
  summarise(median_finalreturn = median(finalreturn, na.rm = TRUE))

# Plot median finalreturn for each distance bin as a bar graph
median_distance <- ggplot(median_data, aes(x = distance_bins, y = median_finalreturn)) +
  geom_col(fill = "skyblue", color = "black") +
  labs(x = "Urban Center Distance (Bins)", y = "Median Final Return") +
  ggtitle("Median Final Return for Each Urban Center Distance Bin")

median_distance

# Calculate mean finalreturn for each distance bin
mean_data <- merged_df %>%
  group_by(distance_bins) %>%
  summarise(mean_finalreturn = mean(finalreturn, na.rm = TRUE))

# Plot mean finalreturn for each distance bin as a bar graph
mean_distance <- ggplot(mean_data, aes(x = distance_bins, y = mean_finalreturn)) +
  geom_col(fill = "skyblue", color = "black") +
  labs(x = "Urban Center Distance (Bins)", y = "Mean Final Return") +
  ggtitle("Mean Final Return for Each Urban Center Distance Bin")


mean_distance

## monthlychurch

# Load necessary libraries
library(ggplot2)


# Calculate averages by incomesteadiness category
median_finalreturn <- aggregate(finalreturn ~ monthlychurch, data = merged_df, FUN = median)

# Create barplot
ggplot(median_finalreturn, aes(x = monthlychurch, y = finalreturn)) +
  geom_bar(stat = "identity") +
  labs(x = "monthlychurch Category", y = "Median Final Return") +
  ggtitle("Median Final Return by monthlychurch Category")

# Create boxplot with limited outliers
ggplot(merged_df, aes(x = monthlychurch, y = finalreturn)) +
  geom_boxplot(coef = 3) +  # Adjust this coefficient as needed
  labs(x = "monthlychurch Category", y = "Final Return") +
  ggtitle("Boxplot of Final Return by monthlychurch Category") +
  ylim(0,100)

## Warning: Removed 5212 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

age

merged_df$age <- as.numeric(as.character(merged_df$age))
# Load necessary libraries

# Filter out missing or non-numeric values in urbancenterdistance
merged_df <- merged_df %>%
  filter(!is.na(age) & is.numeric(age))

# Bin age into groups of ten
merged_df <- merged_df %>%
  mutate(distance_bins = cut(age, breaks = seq(0, max(age), by = 10), include.lowest = TRUE))

# Calculate median finalreturn for each distance bin
median_data <- merged_df %>%
  group_by(distance_bins) %>%
  summarise(median_finalreturn = median(finalreturn, na.rm = TRUE))

# Plot median finalreturn for each distance bin as a bar graph
median_age <- ggplot(median_data, aes(x = distance_bins, y = median_finalreturn)) +
  geom_col(fill = "skyblue", color = "black") +
  labs(x = "age (Bins)", y = "Median Final Return") +
  ggtitle("Median Final Return for Each age Bin")

median_age

#delete first two and last columns

noofcattleorbuffalo

merged_df$noofcattleorbuffalo <- as.numeric(as.character(merged_df$noofcattleorbuffalo))
# Load necessary libraries

# Filter out missing or non-numeric values in urbancenterdistance
merged_df <- merged_df %>%
  filter(!is.na(noofcattleorbuffalo) & is.numeric(noofcattleorbuffalo))

# Bin age into groups of ten
merged_df <- merged_df %>%
  mutate(distance_bins = cut(noofcattleorbuffalo, breaks = seq(0, max(noofcattleorbuffalo), by = 10), include.lowest = TRUE))

# Calculate median finalreturn for each distance bin
median_data <- merged_df %>%
  group_by(distance_bins) %>%
  summarise(median_finalreturn = median(finalreturn, na.rm = TRUE))

# Plot median finalreturn for each distance bin as a bar graph
median_noofcattleorbuffalo <- ggplot(median_data, aes(x = distance_bins, y = median_finalreturn)) +
  geom_col(fill = "skyblue", color = "black") +
  labs(x = "noofcattleorbuffalo (Bins)", y = "Median Final Return") +
  ggtitle("Median Final Return for noofcattleorbuffalo")

median_noofcattleorbuffalo

ownsland

# Load necessary libraries
library(ggplot2)


# Calculate averages by incomesteadiness category
median_finalreturn <- aggregate(finalreturn ~ ownsland, data = merged_df, FUN = median)

# Create barplot
median_ownsland <- ggplot(median_finalreturn, aes(x = ownsland, y = finalreturn)) +
  geom_bar(stat = "identity") +
  labs(x = "ownsland", y = "Median Final Return") +
  ggtitle("Median Final Return by ownsland")


# Create boxplot with limited outliers
boxplot_ownsland <- ggplot(merged_df, aes(x = ownsland, y = finalreturn)) +
  geom_boxplot(coef = 3) +  # Adjust this coefficient as needed
  labs(x = "ownsland", y = "Final Return") +
  ggtitle("Boxplot of Final Return by ownsland") +
  ylim(0,100)

boxplot_ownsland

## Warning: Removed 4160 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

MFC Data Exploration (Rough)

Ben, Serene, and Lydia

2024-04-10

EDA

adjumani

Incomesteadiness

urbancenterdistance

age

noofcattleorbuffalo

ownsland