#Import the data
library(readr)
data <- read_csv("Pipeline data.csv")
#Data cleaning
# Find columns with only NA values
na_columns <- colSums(is.na(data)) == nrow(data)
# Remove columns with only NA values
data <- data[, !na_columns]
# Select relevant columns for analysis (excluding dates)
relevant_columns <- c("Business Unit", "Zip Code", "State", "Project Type", "Project Type (Detail)",
"Pursuit", "In-Process", "In-Process & Anchor Open", "Complete",
"Total Project Costs...13", "Allocation Check")
# Subset the data
subset_data <- data[, relevant_columns]
# Calculate summary statistics
summary(subset_data)
## Business Unit Zip Code State Project Type
## Length:40 Length:40 Length:40 Length:40
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Project Type (Detail) Pursuit In-Process
## Length:40 Min. : 89.0 Min. : 62.0
## Class :character 1st Qu.: 293.0 1st Qu.: 328.2
## Mode :character Median : 669.5 Median : 427.5
## Mean : 978.5 Mean : 455.1
## 3rd Qu.:1734.0 3rd Qu.: 571.5
## Max. :2100.0 Max. :1086.0
## In-Process & Anchor Open Complete Total Project Costs...13
## Min. :-122.0 Min. :365 Length:40
## 1st Qu.: 14.5 1st Qu.:365 Class :character
## Median : 91.0 Median :365 Mode :character
## Mean : 129.1 Mean :365
## 3rd Qu.: 198.8 3rd Qu.:365
## Max. : 688.0 Max. :365
## Allocation Check
## Length:40
## Class :character
## Mode :character
##
##
##
#Describe the data
library(psych)
describe(subset_data)
## vars n mean sd median trimmed mad min max
## Business Unit* 1 40 20.48 11.65 20.5 20.50 14.83 1 39
## Zip Code* 2 40 19.50 10.86 19.5 19.59 14.08 1 36
## State* 3 40 5.92 3.45 4.5 5.91 5.19 1 11
## Project Type* 4 40 1.73 0.45 2.0 1.78 0.00 1 2
## Project Type (Detail)* 5 40 2.67 0.97 3.0 2.69 1.48 1 5
## Pursuit 6 40 978.45 733.53 669.5 951.78 702.75 89 2100
## In-Process 7 40 455.12 221.66 427.5 437.34 195.70 62 1086
## In-Process & Anchor Open 8 40 129.05 161.11 91.0 105.28 134.18 -122 688
## Complete 9 40 365.00 0.00 365.0 365.00 0.00 365 365
## Total Project Costs...13* 10 40 20.50 11.69 20.5 20.50 14.83 1 40
## Allocation Check* 11 40 20.50 11.69 20.5 20.50 14.83 1 40
## range skew kurtosis se
## Business Unit* 38 -0.01 -1.30 1.84
## Zip Code* 35 -0.04 -1.32 1.72
## State* 10 0.08 -1.47 0.55
## Project Type* 1 -0.97 -1.08 0.07
## Project Type (Detail)* 4 0.01 -0.48 0.15
## Pursuit 2011 0.28 -1.72 115.98
## In-Process 1024 0.75 0.52 35.05
## In-Process & Anchor Open 810 1.42 2.14 25.47
## Complete 0 NaN NaN 0.00
## Total Project Costs...13* 39 0.00 -1.29 1.85
## Allocation Check* 39 0.00 -1.29 1.85
# View the first few rows of the data
head(subset_data)
## # A tibble: 6 × 11
## `Business Unit` `Zip Code` State `Project Type` Project Type (Detail…¹ Pursuit
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 6060901 90066 CA Development Development 1734
## 2 9008804 22310 VA Redevelopment Renovation 89
## 3 8013112 33179 FL Redevelopment Anchor Repositioning 2008
## 4 8004301 33180 FL Redevelopment Significant Redevelop… 1734
## 5 66502 78751 TX Redevelopment Redevelopment 1124
## 6 8012831 94015 CA Redevelopment Anchor Repositioning 1734
## # ℹ abbreviated name: ¹​`Project Type (Detail)`
## # ℹ 5 more variables: `In-Process` <dbl>, `In-Process & Anchor Open` <dbl>,
## # Complete <dbl>, `Total Project Costs...13` <chr>, `Allocation Check` <chr>
# Load required libraries
library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
# Explore the relationship between project type and completion status
completion_status <- data %>%
group_by(`Project Type`, Complete) %>%
summarize(count = n())
completion_status
## # A tibble: 2 × 3
## # Groups: Project Type [2]
## `Project Type` Complete count
## <chr> <dbl> <int>
## 1 Development 365 11
## 2 Redevelopment 365 29
# Count of projects by project type
project_type_counts <- data %>%
group_by(`Project Type`) %>%
summarize(count = n())
# Correlation matrix for numeric variables
numeric_cols <- select_if(data, is.numeric)
cor_matrix <- cor(numeric_cols)
print(cor_matrix)
## Pursuit In-Process In-Process & Anchor Open
## Pursuit 1.00000000 0.2715440 -0.00807444
## In-Process 0.27154399 1.0000000 0.58602381
## In-Process & Anchor Open -0.00807444 0.5860238 1.00000000
## Complete NA NA NA
## Complete
## Pursuit NA
## In-Process NA
## In-Process & Anchor Open NA
## Complete 1
# Visualization
# Count the number of projects by project type
project_type_counts <- data %>% count(`Project Type`, sort = TRUE)
# Bar plot of project type counts
ggplot(project_type_counts, aes(x = `Project Type`, y = n)) +
geom_bar(stat = "identity", fill = "cornflowerblue") +
labs(title = "Project Type Counts", x = "Project Type", y = "Count") +
theme_bw()
# Reshape the data to long format
data_long <- tidyr::pivot_longer(data, cols = c(`Complete`, `In-Process`, `In-Process & Anchor Open`), names_to = "Status", values_to = "Count")
# Create a bar plot of completions, in-process, and in-process & anchor open by states
ggplot(data_long, aes(x = State, y = Count, fill = Status)) +
geom_bar(stat = "identity", position = "stack") +
labs(title = "Status of Projects by States", x = "State", y = "Count") +
theme_bw() +
facet_wrap(~ Status, nrow = 1) +
theme(axis.text.x = element_text(angle = 78, hjust = 1))
Business cycles. Does the expenditure tend to vary with the level of economic activity or are they independent of cycles?
# Create a bar plot of project costs (economic activity) by state
ggplot(data, aes(x = State, y = `Total Project Costs...13`)) +
geom_bar(stat = "identity", fill = "steelblue") +
labs(title = "Project Costs by States Economic Activity", x = "State", y = "Project Costs") +
theme_bw()+ theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())
Trends. Are locations or project types affecting expenditure tend?
library(ggplot2)
ggplot(data, aes(x = State, y = `Total Project Costs...13`, color = `Project Type`)) +
geom_point() +
labs(x = "State", y = "Total Project Costs", title = "Relationship between State and Total Project Costs") +
theme_bw()
Outliers and historical anomalies. Does the data contain any extreme values that need to be explained? It could be that these represent highly anomalous events that don’t add to the predictive power of the data set.
# Numeric columns to analyze
numeric_columns <- names(data)[sapply(data, is.numeric)]
# Visualize data using box plots
boxplot_data <- data[, numeric_columns]
boxplot(boxplot_data, main = "Boxplot of Numeric Variables")
# Calculate z-scores for all numerical variables
z_scores <- scale(data[, sapply(data, is.numeric)])
# Identify outliers using z-scores
z_scores <- as.data.frame(scale(data[, numeric_columns]))
outliers <- which(abs(z_scores) > 3)
num_outliers <- length(outliers) # Count of outliers
print(paste("Number of outliers:", num_outliers))
## [1] "Number of outliers: 1"
Relationships between variables. Are there important relationships between variables that could aid in forecasting? Based on your exploratory analysis and understanding of the data, you identify and propose potential methods (regression, time series forecasting, machine learning, etc.) to provide solutions to the business problem.
# Perform regression analysis
lm_model <- lm(Complete ~ `Project Type` + `Total Project Costs...13`, data = data)
summary(lm_model)
##
## Call:
## lm(formula = Complete ~ `Project Type` + `Total Project Costs...13`,
## data = data)
##
## Residuals:
## ALL 40 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.650e+02 NaN NaN NaN
## `Project Type`Redevelopment -7.334e-29 NaN NaN NaN
## `Total Project Costs...13`$1,486,691 -1.237e-29 NaN NaN NaN
## `Total Project Costs...13`$1,653,000 -5.510e-29 NaN NaN NaN
## `Total Project Costs...13`$1,677,004 -1.037e-28 NaN NaN NaN
## `Total Project Costs...13`$11,011,311 2.473e-29 NaN NaN NaN
## `Total Project Costs...13`$11,058,903 -3.425e-28 NaN NaN NaN
## `Total Project Costs...13`$11,140,000 -5.049e-29 NaN NaN NaN
## `Total Project Costs...13`$11,150,028 6.725e-29 NaN NaN NaN
## `Total Project Costs...13`$11,500,000 4.024e-29 NaN NaN NaN
## `Total Project Costs...13`$11,631,425 1.610e-29 NaN NaN NaN
## `Total Project Costs...13`$12,085,081 5.204e-28 NaN NaN NaN
## `Total Project Costs...13`$12,839,060 -2.752e-28 NaN NaN NaN
## `Total Project Costs...13`$16,364,000 -2.917e-28 NaN NaN NaN
## `Total Project Costs...13`$17,279,113 -6.419e-29 NaN NaN NaN
## `Total Project Costs...13`$18,884,506 -4.363e-28 NaN NaN NaN
## `Total Project Costs...13`$2,245,522 -2.304e-28 NaN NaN NaN
## `Total Project Costs...13`$20,000,000 -4.055e-28 NaN NaN NaN
## `Total Project Costs...13`$21,885,063 1.361e-28 NaN NaN NaN
## `Total Project Costs...13`$28,265,750 7.190e-13 NaN NaN NaN
## `Total Project Costs...13`$3,000,000 3.173e-44 NaN NaN NaN
## `Total Project Costs...13`$3,055,370 -1.090e-44 NaN NaN NaN
## `Total Project Costs...13`$3,213,526 1.015e-44 NaN NaN NaN
## `Total Project Costs...13`$3,246,790 -1.771e-28 NaN NaN NaN
## `Total Project Costs...13`$3,377,806 4.667e-44 NaN NaN NaN
## `Total Project Costs...13`$3,700,000 1.818e-28 NaN NaN NaN
## `Total Project Costs...13`$3,800,000 2.058e-44 NaN NaN NaN
## `Total Project Costs...13`$4,000,000 -4.852e-29 NaN NaN NaN
## `Total Project Costs...13`$4,673,340 8.918e-45 NaN NaN NaN
## `Total Project Costs...13`$440,000 4.986e-29 NaN NaN NaN
## `Total Project Costs...13`$45,000,000 1.367e-29 NaN NaN NaN
## `Total Project Costs...13`$49,066,000 1.367e-29 NaN NaN NaN
## `Total Project Costs...13`$5,000,000 4.686e-44 NaN NaN NaN
## `Total Project Costs...13`$6,400,000 -4.068e-44 NaN NaN NaN
## `Total Project Costs...13`$6,450,000 -1.788e-28 NaN NaN NaN
## `Total Project Costs...13`$65,000,000 1.367e-29 NaN NaN NaN
## `Total Project Costs...13`$7,390,805 -3.549e-44 NaN NaN NaN
## `Total Project Costs...13`$7,460,362 -2.664e-45 NaN NaN NaN
## `Total Project Costs...13`$7,803,702 1.716e-45 NaN NaN NaN
## `Total Project Costs...13`$8,260,000 NA NA NA NA
## `Total Project Costs...13`$9,900,546 2.809e-29 NaN NaN NaN
##
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared: NaN, Adjusted R-squared: NaN
## F-statistic: NaN on 39 and 0 DF, p-value: NA
library(ggplot2)
# Subset the relevant columns from the data
subset_data <- data[, c("Total Project Costs...13", "Project Type", "State")]
# Remove rows with missing values in Total Project Costs...13
subset_data <- subset_data[!is.na(subset_data$`Total Project Costs...13`), ]
# Create a box plot to compare Total Project Costs by Project Type
ggplot(subset_data, aes(x = `Project Type`, y = `Total Project Costs...13`)) +
geom_boxplot() +
labs(x = "Project Type", y = "Total Project Costs",
title = "Comparison of Total Project Costs by Project Type") +
theme_bw()
# Create a bar plot to compare Total Project Costs by State
ggplot(subset_data, aes(x = `State`, y = `Total Project Costs...13`, fill = `State`)) +
geom_bar(stat = "identity") +
labs(x = "State", y = "Total Project Costs",
title = "Comparison of Total Project Costs by State") +
theme_bw()+
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())