Name: Grace Wheeler Date: 02/06/2025

Problem Set Part II

# Load any necessary packages 
# Load necessary libraries
library(knitr)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
install.packages("tinytex", repos = "https://cloud.r-project.org/")
## 
## The downloaded binary packages are in
##  /var/folders/3x/hr9kf03s4qqgkgd1psy8k7_80000gn/T//RtmpM7VS8D/downloaded_packages
library("tinytex")
knitr::opts_knit$set(root.dir = "/Users/gracewheeler/Desktop/Pols300_Spring2025/Programming labs/DSS")
# Load your data


# Verify working directory
print(getwd())
## [1] "/Users/gracewheeler/Desktop/Pols300_Spring2025/Programming labs/DSS"
# Check if the file exists
print(list.files())
##  [1] "BES.csv"                        "countries.csv"                 
##  [3] "ecuador_data.csv"               "Experimental.R"                
##  [5] "Introduction.R"                 "MPP.csv"                       
##  [7] "Observational.R"                "Population.R"                  
##  [9] "Prediction.R"                   "Probability.R"                 
## [11] "procurement_loroguay_1.csv"     "Programming Lab 4.nb.html"     
## [13] "Programming Lab 4.Rmd"          "Programming Lab3.nb.html"      
## [15] "Programming Lab3.Rmd"           "Programming Lab4.nb.html"      
## [17] "Programming Lab4.Rmd"           "Programming-Lab-4.pdf"         
## [19] "Programming-Lab3.pdf"           "Programming-Lab4.pdf"          
## [21] "Programminglab3.pdf"            "ProgrammingLab3Wheeler.nb.html"
## [23] "ProgrammingLab3Wheeler.Rmd"     "README.md"                     
## [25] "STAR.csv"                       "UA_precincts.csv"              
## [27] "UA_survey.csv"                  "UK_districts.csv"              
## [29] "Uncertainty.R"                  "urbanization-vs-gdp.csv"
# Load data
df_loroguay <- read.csv("procurement_loroguay_1.csv")
head(df_loroguay)
##   award_connected award_sole contract_value transparency citycode
## 1              NA          1             NA           NA 12100101
## 2              NA         NA          31500       0.7823 12100101
## 3               0         NA          59565       0.7823 12100101
## 4               0          1             NA       0.7823 12100101
## 5               0          1          11952       0.7823 12100101
## 6               0          0             NA           NA 12100101
##   contract_purpose high_percapita_gdp
## 1                2                  1
## 2                3                  1
## 3                3                  1
## 4                4                  1
## 5                3                 NA
## 6                2                  1

(0.25 points) What is the average number of contracts given to the sole bidder in our sample? What is the standard deviation?

code to arrive at the answer:

# Inspect the column names to confirm
names(df_loroguay)
## [1] "award_connected"    "award_sole"         "contract_value"    
## [4] "transparency"       "citycode"           "contract_purpose"  
## [7] "high_percapita_gdp"
# Ensure the column "award_sole" is numeric
df_loroguay$award_sole <- as.numeric(as.character(df_loroguay$award_sole))
# Check for NA values in "award_sole"
sum(is.na(df_loroguay$award_sole))  # Count the NA values
## [1] 147
# Calculate the mean and standard deviation, ignoring NA values
mean_award_sole <- mean(df_loroguay$award_sole, na.rm = TRUE)
sd_award_sole <- sd(df_loroguay$award_sole, na.rm = TRUE)
# Print results
cat("Average number of contracts awarded to sole bidders:", mean_award_sole, "\n")
## Average number of contracts awarded to sole bidders: 0.5980975
cat("Standard deviation:", sd_award_sole, "\n")
## Standard deviation: 0.4905742

Average: 0.5980975

Standard deviation: 0.4905742

(0.25 points) What is the average number of contracts given to connected firms in our sample? What is the standard deviation?

code to arrive at the answer:

# Check for NA values in "award_connected" column
sum(is.na(df_loroguay$award_connected))  # Count the NA values
## [1] 149
# Calculate the average (mean) and standard deviation, excluding NA values
mean_award_connected <- mean(df_loroguay$award_connected, na.rm = TRUE)
sd_award_connected <- sd(df_loroguay$award_connected, na.rm = TRUE)
# Print results
cat("Average number of contracts awarded to connected firms:", mean_award_connected, "\n")
## Average number of contracts awarded to connected firms: 0.3742551
cat("Standard deviation:", sd_award_connected, "\n")
## Standard deviation: 0.4842186

answer: Average: 0.3742551 Standard deviation: 0.4842186

(0.25 points) What is the average number of contracts given to connected firms in municipalities with a high per-capita GDP? What is the standard deviation?

code to arrive at the answer:

# Filter the data for municipalities with high per-capita GDP
high_gdp_data <- subset(df_loroguay, high_percapita_gdp == 1)  # Assuming 1 means high GDP
# Calculate the average (mean) and standard deviation for connected firms in high-GDP municipalities
mean_high_gdp_connected <- mean(high_gdp_data$award_connected, na.rm = TRUE)
sd_high_gdp_connected <- sd(high_gdp_data$award_connected, na.rm = TRUE)
# Print the results
cat("Average number of contracts awarded to connected firms in high-GDP municipalities:", mean_high_gdp_connected, "\n")
## Average number of contracts awarded to connected firms in high-GDP municipalities: 0.2707424
cat("Standard deviation:", sd_high_gdp_connected, "\n")
## Standard deviation: 0.4453166

answer: Average: 0.2707424 Standard deviation: 0.4453166

(0.25 points) What is the average number of construction contracts given to connected firms in our sample? What is the standard deviation?

code to arrive at the answer:

# Filter the data for construction contracts and connected firms
construction_data <- subset(df_loroguay, grepl("construction", contract_purpose, ignore.case = TRUE) & !is.na(award_connected))
# Calculate the average (mean) and standard deviation for connected firms with construction contracts
mean_construction_connected <- mean(construction_data$award_connected, na.rm = TRUE)
sd_construction_connected <- sd(construction_data$award_connected, na.rm = TRUE)
# Print the results
cat("Average number of construction contracts awarded to connected firms:", mean_construction_connected, "\n")
## Average number of construction contracts awarded to connected firms: NaN
cat("Standard deviation:", sd_construction_connected, "\n")
## Standard deviation: NA

answer:

(0.25 points) What is the average contract value in the sample (in US dollars).

code to arrive at the answer:

# Calculate the average contract value (ignoring NA values)
mean_contract_value <- mean(df_loroguay$contract_value, na.rm = TRUE)

# Print the result
cat("Average contract value in the sample (in US dollars):", mean_contract_value, "\n")
## Average contract value in the sample (in US dollars): 677642.7

answer:677642.7

(0.25 points) What is the average contract value (in US dollars) for contracts given to connected firms?

code to arrive at the answer:

# Ensure the columns are numeric
df_loroguay$contract_value <- as.numeric(as.character(df_loroguay$contract_value))
df_loroguay$award_connected <- as.numeric(as.character(df_loroguay$award_connected))

# Filter for rows where the firm is connected
connected_firms <- subset(df_loroguay, award_connected == 1)

# Calculate the average contract value for connected firms
mean_connected_value <- mean(connected_firms$contract_value, na.rm = TRUE)

# Print the result
cat("Average contract value for contracts given to connected firms (in US dollars):", mean_connected_value, "\n")
## Average contract value for contracts given to connected firms (in US dollars): 1288678

answer: 1288678

(1 point) Create a nicely formatted summary statistics table including only the relevant variables (i.e., exclude citycode).

code to arrive at the answer:

relevant_vars <- df_loroguay %>%
  select(award_connected, award_sole, contract_value, transparency, contract_purpose, high_percapita_gdp)

summary_stats <- relevant_vars %>%
  summarise(
    across(
      where(is.numeric),
      list(
        Mean = ~mean(., na.rm = TRUE),
        SD = ~sd(., na.rm = TRUE),
        Min = ~min(., na.rm = TRUE),
        Max = ~max(., na.rm = TRUE)
      )
    )
  ) %>%
  pivot_longer(cols = everything(), names_to = c("Variable", "Statistic"), names_sep = "_") %>%
  pivot_wider(names_from = Statistic, values_from = value)
## Warning: Expected 2 pieces. Additional pieces discarded in 20 rows [1, 2, 3, 4, 5, 6, 7,
## 8, 9, 10, 11, 12, 17, 18, 19, 20, 21, 22, 23, 24].
## Warning: Values from `value` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
##   {data} |>
##   dplyr::summarise(n = dplyr::n(), .by = c(Variable, Statistic)) |>
##   dplyr::filter(n > 1L)
# Format as a nicely styled table
summary_stats %>%
  kable(format = "html", caption = "Summary Statistics for Relevant Variables") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
Summary Statistics for Relevant Variables
Variable connected sole value Mean SD Min Max purpose percapita
award 0.3742551, 0.4842186, 0.0000000, 1.0000000 0.5980975, 0.4905742, 0.0000000, 1.0000000 NULL NULL NULL NULL NULL NULL NULL
contract NULL NULL 677642.7, 4097811.4, 1885.0, 113510100.0 NULL NULL NULL NULL 3.1928144, 0.8001475, 1.0000000, 4.0000000 NULL
transparency NULL NULL NULL 0.56396 0.2017189 0.0611 0.8563 NULL NULL
high NULL NULL NULL NULL NULL NULL NULL NULL 0.3213429, 0.4672724, 0.0000000, 1.0000000

answer:

(1 point) Bar plot code to arrive at the answer:

# Summarize data for the bar plot
plot_data <- df_loroguay %>%
  group_by(contract_purpose) %>%
  summarise(
    connected_contracts = sum(award_connected, na.rm = TRUE)
  )

# Create the bar plot
ggplot(plot_data, aes(x = contract_purpose, y = connected_contracts, fill = contract_purpose)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  labs(
    title = "Contracts Awarded to Connected Firms by Purpose",
    x = "Contract Purpose",
    y = "Number of Connected Contracts"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

answer:

(1 point) Create a nicely formatted bar plot showing the average value of contracts for contracts given to sole bidders and contracts given to firms that competed against other bidders.

code to arrive at the answer:

# Load necessary libraries
library(tidyverse)

# Ensure 'award_sole' and 'contract_value' are in the correct format
df_loroguay$award_sole <- as.factor(df_loroguay$award_sole)
df_loroguay$contract_value <- as.numeric(df_loroguay$contract_value)

# Summarize the data: calculate the average contract value for sole bidders and competitive bidders
plot_data <- df_loroguay %>%
  group_by(award_sole) %>%
  summarise(
    avg_contract_value = mean(contract_value, na.rm = TRUE)
  ) %>%
  mutate(
    award_sole = recode(award_sole, "0" = "Competitive Bidders", "1" = "Sole Bidders")
  )

# Create the bar plot
ggplot(plot_data, aes(x = award_sole, y = avg_contract_value, fill = award_sole)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  labs(
    title = "Average Contract Value for Sole vs. Competitive Bidders",
    x = "Bidder Type",
    y = "Average Contract Value (US Dollars)"
  ) +
  scale_fill_manual(values = c("Competitive Bidders" = "steelblue", "Sole Bidders" = "darkorange")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
        axis.text.y = element_text(size = 12),
        axis.title = element_text(size = 14),
        plot.title = element_text(size = 16, face = "bold"))

answer:

(1 point)

code to arrive at the answer: Create a nicely formatted bar plot showing the average value of contracts for contracts given to connected bidders and contracts given to bidders without connections.

# Load necessary library
library(ggplot2)

# Aggregate the data to calculate average contract values
average_values <- df_loroguay %>%
  group_by(connection_status = ifelse(award_connected == 1, "Connected", "Not Connected")) %>%
  summarize(avg_contract_value = mean(contract_value, na.rm = TRUE))

# Create the bar plot
ggplot(average_values, aes(x = connection_status, y = avg_contract_value, fill = connection_status)) +
  geom_bar(stat = "identity", width = 0.6, alpha = 0.8) +
  scale_fill_manual(values = c("Connected" = "steelblue", "Not Connected" = "orange")) +
  labs(
    title = "Average Contract Value by Bidder Connection Status",
    x = "Connection Status",
    y = "Average Contract Value (USD)",
    fill = "Connection Status"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.position = "none"
  ) +
  geom_text(aes(label = round(avg_contract_value, 2)), vjust = -0.5, size = 4)

answer:

(0.5 points) Transform your data into a municipality-level dataset. Then, create a scatterplot that shows the relationship between transparency an the number of contracts that the municipality awarded to sole bidders.

code to arrive at the answer:

# Load necessary library
library(ggplot2)

# Create scatterplot
ggplot(df_loroguay, aes(x = transparency, y = award_sole)) +
  geom_point(color = "blue", size = 3, alpha = 0.7) +
  labs(
    title = "Transparency vs. Contracts Awarded to Sole Bidders",
    x = "Transparency",
    y = "Number of Contracts Awarded to Sole Bidders"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )
## Warning: Removed 163 rows containing missing values or values outside the scale range
## (`geom_point()`).

answer: