Instructions:
library(readxl)
# Load the bids CSV from its local OneDrive location into the `Road` data frame.
Road <- read.csv(
  file = "C:\\Users\\etfie\\OneDrive\\Data Visualization\\Road construction bids.csv"
)
There are 235 bids in this data set.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Overall mean of the actual cost and of the DoT estimate; NA values are
# dropped before averaging.
Road %>%
  summarize(
    mean_ActualCost = mean(Actual_Cost, na.rm = TRUE),
    mean_DoTEstimate = mean(DoT_Estimate, na.rm = TRUE)
  )
## mean_ActualCost mean_DoTEstimate
## 1 1268.715 1347.077
library(dplyr)
# One-row summary of the whole data set: average actual cost and average
# DoT estimate, each computed with missing values removed.
summarize(
  .data = Road,
  mean_ActualCost = mean(x = Actual_Cost, na.rm = TRUE),
  mean_DoTEstimate = mean(x = DoT_Estimate, na.rm = TRUE)
)
## mean_ActualCost mean_DoTEstimate
## 1 1268.715 1347.077
difference
# Estimate minus actual cost: positive when the DoT estimate exceeded the
# actual cost, negative when the project cost more than estimated.
Road[["difference"]] <- Road[["DoT_Estimate"]] - Road[["Actual_Cost"]]
percent_difference = (difference / actual cost) × 100
# Express the difference as a percentage of the actual cost.
Road$percent_difference <- with(Road, (difference / Actual_Cost) * 100)
budget = projects whose actual cost is more than the estimate are labeled as “Over Budget”, otherwise “Under Budget”.
# Label each bid: strictly higher actual cost than estimate is "Over Budget";
# everything else (including an exact tie) is "Under Budget".
Road$budget <- with(
  Road,
  ifelse(Actual_Cost > DoT_Estimate, "Over Budget", "Under Budget")
)
large and small projects
# Size class by actual cost: above 1000 is "Large", otherwise "Small".
Road[["Project_Size"]] <- ifelse(
  Road[["Actual_Cost"]] > 1000,
  "Large",
  "Small"
)
library(dplyr)
# Group every bid by a TRUE/FALSE flag for each size class. The grouping key
# is a logical expression, so each summary below has one row per flag value
# (FALSE and TRUE), not just the size class named in the variable.
LargeProjects <- Road %>% group_by(Project_Size == "Large")
SmallProjects <- Road %>% group_by(Project_Size == "Small")

# Mean of `difference` within each flag group, ignoring NA values.
Mean_Large <- LargeProjects %>%
  summarize(Mean_Large = mean(difference, na.rm = TRUE))
Mean_Small <- SmallProjects %>%
  summarize(Mean_Small = mean(difference, na.rm = TRUE))

# Display the summary keyed on `Project_Size == "Large"`.
Mean_Large
## # A tibble: 2 × 2
## `Project_Size == "Large"` Mean_Large
## <lgl> <dbl>
## 1 FALSE 29.8
## 2 TRUE 184.
# Mean percent difference within each TRUE/FALSE size-flag group. These two
# summaries keep one row per flag value, exactly as the grouped inputs dictate.
Percent_Mean_Large <- summarize(
  LargeProjects,
  Mean_Large = mean(percent_difference, na.rm = TRUE)
)
Percent_Mean_Small <- summarize(
  SmallProjects,
  Mean_Small = mean(percent_difference, na.rm = TRUE)
)

# Gap in mean percent difference, Large minus Small, as a single number.
# Subtracting the two summary tables directly also subtracts their logical
# key columns (producing a column of zeros) and pairs rows by position, so it
# returns the gap twice with opposite signs instead of one value. Computing
# the two means straight from `Road` avoids that.
# `which()` drops rows whose Project_Size is NA instead of indexing with NA.
PDBLSP <- mean(
  Road$percent_difference[which(Road$Project_Size == "Large")],
  na.rm = TRUE
) - mean(
  Road$percent_difference[which(Road$Project_Size == "Small")],
  na.rm = TRUE
)
PDBLSP
## [1] -5.844409
Percent_Mean_Large
## # A tibble: 2 × 2
## `Project_Size == "Large"` Mean_Large
## <lgl> <dbl>
## 1 FALSE 10.4
## 2 TRUE 4.58
Percent_Mean_Small
## # A tibble: 2 × 2
## `Project_Size == "Small"` Mean_Small
## <lgl> <dbl>
## 1 FALSE 4.58
## 2 TRUE 10.4
library(ggplot2)
library(dplyr)
# Scatter plot of `difference` against `Actual_Cost`, with points colored by
# project size and a dashed black linear fit (with its standard-error band).
ggplot(
  data = Road,
  mapping = aes(x = Actual_Cost, y = difference, color = Project_Size)
) +
  geom_point(size = 3, alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed") +
  # Fixed colors per size class; both axes use comma-separated number labels.
  scale_color_manual(values = c("Large" = "orchid", "Small" = "sienna")) +
  scale_x_continuous(labels = scales::comma) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Difference vs. Actual Cost",
    subtitle = "Difference Compared to Actual Cost",
    x = "Actual Cost",
    y = "Difference (Estimate - Actual Cost)",
    color = "Project Size"
  ) +
  # Apply the base theme first, then center the title and subtitle on top of it.
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  )
## `geom_smooth()` using formula = 'y ~ x'
library(ggplot2)
# Histogram of `percent_difference` in 30 bins, with the bars for each
# project size drawn side by side (dodged) and outlined in black.
# NOTE(review): the title mentions Actual Cost, but only percent_difference
# is mapped in this plot; confirm the intended title.
ggplot(
  data = Road,
  mapping = aes(x = percent_difference, fill = Project_Size)
) +
  geom_histogram(bins = 30, position = "dodge", color = "black", alpha = 0.7) +
  scale_fill_manual(values = c("Large" = "violetred", "Small" = "yellowgreen")) +
  # scale = 1: the values are already percentages, so only a "%" is appended.
  scale_x_continuous(labels = scales::percent_format(scale = 1)) +
  labs(
    title = "Percent Difference vs Actual Cost",
    subtitle = "Grouped by Project Size",
    x = "Percent Difference %",
    y = "Frequency",
    fill = "Project Size"
  ) +
  # Apply the base theme first, then center the title and subtitle on top of it.
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  )
In the graph for question 7, I noticed that the larger projects cost more than the smaller projects (as expected). I also noticed that the smaller projects cluster more often in the same part of the graph, showing that smaller projects are more consistent in how their difference (estimated cost minus actual cost) relates to their actual cost, whereas the larger projects were not as consistent between their difference and their actual cost.
In the graph for question 8, I noticed that the larger projects had a greater percent difference between their estimated cost and their actual cost than the smaller projects did. The larger projects appear with a higher frequency; however, the smaller projects do not seem to have as large a percent difference as the larger projects do.