# Work on a copy so that the raw `df` is left untouched.
MFC <- df
# Coerce the savings and share-out columns to numeric. Going through
# as.character() first means a factor column would be parsed by its labels.
# Entries that are not numbers become NA and R warns
# "NAs introduced by coercion".
MFC[["totalsave"]] <- as.numeric(as.character(MFC[["totalsave"]]))
MFC[["shareout"]] <- as.numeric(as.character(MFC[["shareout"]]))
# profit = shareout - totalsave, row by row.
MFC[["profit"]] <- with(MFC, shareout - totalsave)
# Keep only the rows for which profit could be computed.
MFC <- MFC[!is.na(MFC[["profit"]]), ]
# Per-individual return summary ----
# For every `uniqueid`, take the smallest recorded `totalsave` and the largest
# recorded `shareout`, then express the gain as a percentage of the savings.
#
# Both columns are coerced to numeric *before* aggregating: min()/max() on
# character data compare strings alphabetically (e.g. "9" > "100000"), which
# would pick the wrong values. Non-numeric entries become NA (R warns
# "NAs introduced by coercion") and are skipped by `na.rm = TRUE`.
#
# An individual with no usable value in a column gets NA for it, instead of
# letting min()/max() return Inf/-Inf with one warning per group. Those
# individuals are then dropped, exactly as the Inf rows were dropped before.
new_df <- df %>%
  mutate(
    totalsave = as.numeric(as.character(totalsave)),
    shareout = as.numeric(as.character(shareout))
  ) %>%
  group_by(uniqueid) %>%
  summarize(
    min_totalsave = if (all(is.na(totalsave))) {
      NA_real_
    } else {
      min(totalsave, na.rm = TRUE)
    },
    max_shareout = if (all(is.na(shareout))) {
      NA_real_
    } else {
      max(shareout, na.rm = TRUE)
    },
    .groups = "drop"
  ) %>%
  filter(is.finite(min_totalsave), is.finite(max_shareout)) %>%
  mutate(
    # Percent change from the minimum savings to the maximum share-out.
    # A minimum savings of 0 yields Inf/NaN here.
    finalreturn = (max_shareout - min_totalsave) / min_totalsave * 100
  )
# Print the updated dataframe
print(new_df)
## # A tibble: 11,413 × 4
## uniqueid min_totalsave max_shareout finalreturn
## <chr> <dbl> <dbl> <dbl>
## 1 Ind-000032 150000 175400 16.9
## 2 Ind-000207 170000 420600 147.
## 3 Ind-000208 180000 392950 118.
## 4 Ind-000211 180000 392950 118.
## 5 Ind-000212 145000 360500 149.
## 6 Ind-000218 66000 313900 376.
## 7 Ind-000219 173000 265150 53.3
## 8 Ind-000220 150000 0 -100
## 9 Ind-000222 105000 208200 98.3
## 10 Ind-000225 126000 226850 80.0
## # ℹ 11,403 more rows
# Drop every row in which any column holds +Inf or -Inf.
new_df <- filter_all(new_df, all_vars(!is.infinite(.)))
# Keep only the rows whose finalreturn is a finite number; this also drops
# NA and NaN.
MFC_filtered <- filter(new_df, is.finite(finalreturn))
# Histogram of finalreturn ----
# Bin width in percentage points of finalreturn. The plot has always used 5;
# the value previously derived from the full data range was computed but never
# passed to geom_histogram().
binwidth <- 5
# Zoom with coord_cartesian() instead of xlim()/ylim(). Scale limits discard
# data before drawing, so every bar taller than the y limit vanished from the
# plot altogether. coord_cartesian() keeps all data and only restricts the
# visible window; bars that exceed it are drawn cut off at the top.
ggplot(MFC_filtered, aes(x = finalreturn)) +
  geom_histogram(binwidth = binwidth, fill = "skyblue", color = "black") +
  labs(x = "finalreturn", y = "Frequency") +
  ggtitle("Histogram of finalreturn") +
  coord_cartesian(xlim = c(0, 200), ylim = c(0, 200))
# Show the individuals ordered from lowest to highest finalreturn. The sorted
# table is only displayed; MFC_filtered itself is not modified.
arrange(MFC_filtered, finalreturn)
## # A tibble: 11,362 × 4
## uniqueid min_totalsave max_shareout finalreturn
## <chr> <dbl> <dbl> <dbl>
## 1 Ind-000220 150000 0 -100
## 2 Ind-000322 100000 0 -100
## 3 Ind-000334 100000 0 -100
## 4 Ind-000416 100000 0 -100
## 5 Ind-000434 100000 0 -100
## 6 Ind-001240 150000 0 -100
## 7 Ind-002907 106000 0 -100
## 8 Ind-003871 220000 0 -100
## 9 Ind-004015 100000 0 -100
## 10 Ind-004019 80000 0 -100
## # ℹ 11,352 more rows
# Classify each individual by the sign of finalreturn:
# above zero -> "positive", below zero -> "negative", otherwise "zero".
MFC_filtered <- MFC_filtered %>%
  mutate(
    positive_negative = case_when(
      finalreturn > 0 ~ "positive",
      finalreturn < 0 ~ "negative",
      TRUE ~ "zero"
    )
  )
# Number of individuals in each class.
table(MFC_filtered[["positive_negative"]])
##
## negative positive zero
## 204 11143 15
# Tail labels at three cut-offs ----
# For each cut-off p (1%, 5%, 10%) a row is "bottom" when its finalreturn is
# strictly below the p-th percentile, "top" when it is strictly above the
# (100 - p)-th percentile, and "middle" otherwise. Rows that tie with a
# threshold therefore stay "middle".

# 1st and 99th percentiles of finalreturn
bottom_1_threshold <- quantile(MFC_filtered$finalreturn, probs = 0.01)
top_1_threshold <- quantile(MFC_filtered$finalreturn, probs = 0.99)
# 5th and 95th percentiles of finalreturn
bottom_5_threshold <- quantile(MFC_filtered$finalreturn, probs = 0.05)
top_5_threshold <- quantile(MFC_filtered$finalreturn, probs = 0.95)
# 10th and 90th percentiles of finalreturn
bottom_10_threshold <- quantile(MFC_filtered$finalreturn, probs = 0.1)
top_10_threshold <- quantile(MFC_filtered$finalreturn, probs = 0.9)

# Add the columns one_percent, five_percent and ten_percent.
MFC_filtered <- MFC_filtered %>%
  mutate(
    one_percent = case_when(
      finalreturn < bottom_1_threshold ~ "bottom",
      finalreturn > top_1_threshold ~ "top",
      TRUE ~ "middle"
    ),
    five_percent = case_when(
      finalreturn < bottom_5_threshold ~ "bottom",
      finalreturn > top_5_threshold ~ "top",
      TRUE ~ "middle"
    ),
    ten_percent = case_when(
      finalreturn < bottom_10_threshold ~ "bottom",
      finalreturn > top_10_threshold ~ "top",
      TRUE ~ "middle"
    )
  )
# Load the openxlsx package (provides write.xlsx)
library(openxlsx)
# Export the per-individual return table to an Excel workbook.
write.xlsx(x = MFC_filtered, file = "individual_return.xlsx")
# Attach the columns of the raw data to the per-individual results, matching
# rows on uniqueid.
# NOTE(review): if `df` holds several rows per uniqueid, each individual is
# repeated once per matching row - confirm that this is intended.
merged_df <- MFC_filtered %>%
  left_join(df, by = "uniqueid")
# `data` is a second name for the same merged table.
data <- merged_df
# Pie charts of a category within each ten_percent group ----
# The lines tagged #CATEGORY name the column being broken down (currently
# `ownslandnumeric`); change all of them together.

# Count the rows for every ten_percent x category combination.
# `.groups = "drop"` returns an ungrouped result without the
# "grouped output" message.
grouped_data <- data %>%
  group_by(ten_percent, ownslandnumeric) %>% #CATEGORY
  summarise(count = n(), .groups = "drop")
# Share of each category within its ten_percent group, in percent.
grouped_data <- grouped_data %>%
  group_by(ten_percent) %>%
  mutate(percent = count / sum(count) * 100) %>%
  ungroup()
# Build one pie chart per ten_percent group (replaces the superseded do()).
# `pie_charts` keeps a `ten_percent` column and a `plot_data` list-column.
pie_groups <- split(grouped_data, grouped_data$ten_percent)
pie_charts <- tibble(
  ten_percent = names(pie_groups),
  plot_data = lapply(unname(pie_groups), function(group_rows) {
    # factor() makes the fill discrete even when the category is stored as a
    # number, so every category gets its own slice colour and legend key.
    ggplot(
      group_rows,
      aes(x = "", y = percent, fill = factor(ownslandnumeric)) #CATEGORY
    ) +
      geom_col(width = 1, color = "white") +
      coord_polar("y", start = 0) +
      ggtitle(paste(
        "Pie Chart for Ten Percent Group:",
        unique(group_rows$ten_percent)
      )) +
      theme_void() +
      theme(legend.position = "bottom") +
      labs(fill = "ownslandnumeric") #CATEGORY
  })
)
# Combine pie charts into a grid layout, one row per ten_percent group
library(patchwork)
wrap_plots(pie_charts$plot_data, nrow = length(unique(grouped_data$ten_percent)))
# Variables to examine: adjumani incomesteadinessnumeric urbancenterdistance cycle1graduationprofit monthlychurch cycle2graduationprofit age toiletnum cycle3graduationprofit noofcattleorbuffalo noofadultsheepgoatsorpigs depsnoregschool noofotheranimals sizeoffirstloan cycle4graduationprofit
# Load necessary libraries
library(ggplot2)
# Median finalreturn for each value of `adjumani`. The formula interface of
# aggregate() drops rows with a missing value in either column.
median_finalreturn <- aggregate(finalreturn ~ adjumani, data = merged_df, FUN = median)
# Bar chart of those medians. The x-axis label used to read
# "Income Steadiness Category", which does not describe this variable.
ggplot(median_finalreturn, aes(x = adjumani, y = finalreturn)) +
  geom_col() +
  labs(x = "Adjumani Category", y = "Median Final Return") +
  ggtitle("Median Final Return by Adjumani Category")
# Load necessary library
library(dplyr)
# Label every row with its Adjumani/gender combination. The four conditions
# are mutually exclusive; rows that match none of them get NA.
adjumani_gender <- mutate(
  merged_df,
  group = case_when(
    gender == "Female" & adjumani == 0 ~ "not adjumani, female",
    gender == "Male" & adjumani == 0 ~ "not adjumani, male",
    gender == "Female" & adjumani == 1 ~ "adjumani, female",
    gender == "Male" & adjumani == 1 ~ "adjumani, male",
    TRUE ~ NA_character_
  )
)
# Box plots of finalreturn by Adjumani/gender group and by gender ----
# The y-axis is zoomed with coord_cartesian() rather than ylim(). ylim()
# removes every row outside 0-200 *before* the box statistics are computed,
# which shifts the medians and quartiles. coord_cartesian() computes the boxes
# from all rows and only restricts the visible window.

# Four boxes: Adjumani membership crossed with gender
ggplot(adjumani_gender, aes(x = group, y = finalreturn)) +
  geom_boxplot(coef = 3) + # whiskers extend up to 3 x IQR; adjust as needed
  labs(x = "Adjumani / Gender Group", y = "Final Return") +
  ggtitle("Boxplot of Final Return by whether the individual is in Adjumani and Female") +
  coord_cartesian(ylim = c(0, 200))
# Two boxes: gender only
ggplot(adjumani_gender, aes(x = gender, y = finalreturn)) +
  geom_boxplot(coef = 3) + # whiskers extend up to 3 x IQR; adjust as needed
  labs(x = "Gender", y = "Final Return") +
  ggtitle("Boxplot of Final Return by whether the individual is Female") +
  coord_cartesian(ylim = c(0, 200))
# Load necessary libraries
library(ggplot2)
# Median finalreturn for each incomesteadiness category. The formula interface
# of aggregate() drops rows with a missing value in either column.
median_finalreturn <- aggregate(finalreturn ~ incomesteadiness, data = merged_df, FUN = median)
# Bar chart of the medians
ggplot(median_finalreturn, aes(x = incomesteadiness, y = finalreturn)) +
  geom_col() +
  labs(x = "Income Steadiness Category", y = "Median Final Return") +
  ggtitle("Median Final Return by Income Steadiness Category")
# Box plot zoomed to 0-100. coord_cartesian() is used instead of ylim()
# because ylim() removes the rows outside the range before the box statistics
# are computed, which distorts the medians and quartiles. Here the boxes are
# computed from all rows and only the visible window is restricted.
ggplot(merged_df, aes(x = incomesteadiness, y = finalreturn)) +
  geom_boxplot(coef = 3) + # whiskers extend up to 3 x IQR; adjust as needed
  labs(x = "Income Steadiness Category", y = "Final Return") +
  ggtitle("Boxplot of Final Return by Income Steadiness Category") +
  coord_cartesian(ylim = c(0, 100))
# Load necessary libraries
library(ggplot2)
# Median and mean finalreturn by distance to the urban centre ----
# Non-numeric entries become NA (with a coercion warning).
merged_df$urbancenterdistance <- as.numeric(as.character(merged_df$urbancenterdistance))
# Width of one distance bin, in the units of urbancenterdistance.
distance_bin_width <- 50
# Rows with a known distance go into their own data frame. `merged_df` is no
# longer overwritten with the filtered rows, so the analyses further down are
# not silently restricted to individuals with a known distance.
#
# The breaks run up to the first multiple of the bin width at or above the
# maximum. Stopping at the last multiple *below* the maximum left the largest
# distances without a bin (NA), and made cut() fail when the maximum was
# smaller than one bin width.
distance_df <- merged_df %>%
  filter(!is.na(urbancenterdistance)) %>%
  mutate(
    distance_bins = cut(
      urbancenterdistance,
      breaks = seq(
        0,
        max(
          distance_bin_width,
          ceiling(max(urbancenterdistance) / distance_bin_width) *
            distance_bin_width
        ),
        by = distance_bin_width
      ),
      include.lowest = TRUE
    )
  )
# Calculate median finalreturn for each distance bin
median_data <- distance_df %>%
  group_by(distance_bins) %>%
  summarise(median_finalreturn = median(finalreturn, na.rm = TRUE))
# Plot median finalreturn for each distance bin as a bar graph
median_distance <- ggplot(median_data, aes(x = distance_bins, y = median_finalreturn)) +
  geom_col(fill = "skyblue", color = "black") +
  labs(x = "Urban Center Distance (Bins)", y = "Median Final Return") +
  ggtitle("Median Final Return for Each Urban Center Distance Bin")
median_distance
# Calculate mean finalreturn for each distance bin
mean_data <- distance_df %>%
  group_by(distance_bins) %>%
  summarise(mean_finalreturn = mean(finalreturn, na.rm = TRUE))
# Plot mean finalreturn for each distance bin as a bar graph
mean_distance <- ggplot(mean_data, aes(x = distance_bins, y = mean_finalreturn)) +
  geom_col(fill = "skyblue", color = "black") +
  labs(x = "Urban Center Distance (Bins)", y = "Mean Final Return") +
  ggtitle("Mean Final Return for Each Urban Center Distance Bin")
mean_distance
## monthlychurch
# Load necessary libraries
library(ggplot2)
# Median finalreturn for each monthlychurch category. The formula interface of
# aggregate() drops rows with a missing value in either column.
median_finalreturn <- aggregate(finalreturn ~ monthlychurch, data = merged_df, FUN = median)
# Bar chart of the medians
ggplot(median_finalreturn, aes(x = monthlychurch, y = finalreturn)) +
  geom_col() +
  labs(x = "monthlychurch Category", y = "Median Final Return") +
  ggtitle("Median Final Return by monthlychurch Category")
# Box plot zoomed to 0-100. coord_cartesian() is used instead of ylim()
# because ylim() removes the rows outside the range before the box statistics
# are computed, which distorts the medians and quartiles. Here the boxes are
# computed from all rows and only the visible window is restricted.
ggplot(merged_df, aes(x = monthlychurch, y = finalreturn)) +
  geom_boxplot(coef = 3) + # whiskers extend up to 3 x IQR; adjust as needed
  labs(x = "monthlychurch Category", y = "Final Return") +
  ggtitle("Boxplot of Final Return by monthlychurch Category") +
  coord_cartesian(ylim = c(0, 100))
# Median finalreturn by age group ----
# Non-numeric entries become NA (with a coercion warning).
merged_df$age <- as.numeric(as.character(merged_df$age))
# Width of one age bin.
age_bin_width <- 10
# Rows with a known age go into their own data frame. `merged_df` is no longer
# overwritten with the filtered rows, so the analyses further down are not
# silently restricted to individuals with a known age.
#
# The breaks run up to the first multiple of the bin width at or above the
# maximum age. Stopping at the last multiple *below* the maximum left the
# oldest individuals without a bin (NA).
age_df <- merged_df %>%
  filter(!is.na(age)) %>%
  mutate(
    age_bins = cut(
      age,
      breaks = seq(
        0,
        max(age_bin_width, ceiling(max(age) / age_bin_width) * age_bin_width),
        by = age_bin_width
      ),
      include.lowest = TRUE
    )
  )
# Calculate median finalreturn for each age bin
median_data <- age_df %>%
  group_by(age_bins) %>%
  summarise(median_finalreturn = median(finalreturn, na.rm = TRUE))
# Plot median finalreturn for each age bin as a bar graph
median_age <- ggplot(median_data, aes(x = age_bins, y = median_finalreturn)) +
  geom_col(fill = "skyblue", color = "black") +
  labs(x = "age (Bins)", y = "Median Final Return") +
  ggtitle("Median Final Return for Each age Bin")
median_age
#delete first two and last columns
# Median finalreturn by number of cattle or buffalo ----
# Non-numeric entries become NA (with a coercion warning).
merged_df$noofcattleorbuffalo <- as.numeric(as.character(merged_df$noofcattleorbuffalo))
# Width of one bin, in animals.
cattle_bin_width <- 10
# Rows with a known count go into their own data frame. `merged_df` is no
# longer overwritten with the filtered rows, so the analyses further down are
# not silently restricted to individuals with a known count.
#
# The breaks run up to the first multiple of the bin width at or above the
# maximum count. Stopping at the last multiple *below* the maximum left the
# largest herds without a bin (NA), and made cut() fail when nobody owned at
# least one full bin width of animals.
cattle_df <- merged_df %>%
  filter(!is.na(noofcattleorbuffalo)) %>%
  mutate(
    cattle_bins = cut(
      noofcattleorbuffalo,
      breaks = seq(
        0,
        max(
          cattle_bin_width,
          ceiling(max(noofcattleorbuffalo) / cattle_bin_width) *
            cattle_bin_width
        ),
        by = cattle_bin_width
      ),
      include.lowest = TRUE
    )
  )
# Calculate median finalreturn for each bin
median_data <- cattle_df %>%
  group_by(cattle_bins) %>%
  summarise(median_finalreturn = median(finalreturn, na.rm = TRUE))
# Plot median finalreturn for each bin as a bar graph
median_noofcattleorbuffalo <- ggplot(median_data, aes(x = cattle_bins, y = median_finalreturn)) +
  geom_col(fill = "skyblue", color = "black") +
  labs(x = "noofcattleorbuffalo (Bins)", y = "Median Final Return") +
  ggtitle("Median Final Return for noofcattleorbuffalo")
median_noofcattleorbuffalo
# Load necessary libraries
library(ggplot2)
# Median finalreturn for each value of `ownsland`. The formula interface of
# aggregate() drops rows with a missing value in either column.
median_finalreturn <- aggregate(finalreturn ~ ownsland, data = merged_df, FUN = median)
# Bar chart of the medians. It is displayed here, like the stored plots of the
# other sections; previously it was built but never shown.
median_ownsland <- ggplot(median_finalreturn, aes(x = ownsland, y = finalreturn)) +
  geom_col() +
  labs(x = "ownsland", y = "Median Final Return") +
  ggtitle("Median Final Return by ownsland")
median_ownsland
# Box plot zoomed to 0-100. coord_cartesian() is used instead of ylim()
# because ylim() removes the rows outside the range before the box statistics
# are computed, which distorts the medians and quartiles. Here the boxes are
# computed from all rows and only the visible window is restricted.
boxplot_ownsland <- ggplot(merged_df, aes(x = ownsland, y = finalreturn)) +
  geom_boxplot(coef = 3) + # whiskers extend up to 3 x IQR; adjust as needed
  labs(x = "ownsland", y = "Final Return") +
  ggtitle("Boxplot of Final Return by ownsland") +
  coord_cartesian(ylim = c(0, 100))
boxplot_ownsland