#Import the data
library(readr)
data <- read_csv("Pipeline data.csv")

#Data cleaning

# Find columns with only NA values
na_columns <- colSums(is.na(data)) == nrow(data)

# Remove columns with only NA values
data <- data[, !na_columns]

Descriptive statistics

# Select relevant columns for analysis (excluding dates)
relevant_columns <- c("Business Unit", "Zip Code", "State", "Project Type", "Project Type (Detail)",
                      "Pursuit", "In-Process", "In-Process & Anchor Open", "Complete",
                      "Total Project Costs...13", "Allocation Check")

# Subset the data
subset_data <- data[, relevant_columns]

# Calculate summary statistics
summary(subset_data)
##  Business Unit        Zip Code            State           Project Type      
##  Length:40          Length:40          Length:40          Length:40         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Project Type (Detail)    Pursuit         In-Process    
##  Length:40             Min.   :  89.0   Min.   :  62.0  
##  Class :character      1st Qu.: 293.0   1st Qu.: 328.2  
##  Mode  :character      Median : 669.5   Median : 427.5  
##                        Mean   : 978.5   Mean   : 455.1  
##                        3rd Qu.:1734.0   3rd Qu.: 571.5  
##                        Max.   :2100.0   Max.   :1086.0  
##  In-Process & Anchor Open    Complete   Total Project Costs...13
##  Min.   :-122.0           Min.   :365   Length:40               
##  1st Qu.:  14.5           1st Qu.:365   Class :character        
##  Median :  91.0           Median :365   Mode  :character        
##  Mean   : 129.1           Mean   :365                           
##  3rd Qu.: 198.8           3rd Qu.:365                           
##  Max.   : 688.0           Max.   :365                           
##  Allocation Check  
##  Length:40         
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
#Describe the data
library(psych)
describe(subset_data)
##                           vars  n   mean     sd median trimmed    mad  min  max
## Business Unit*               1 40  20.48  11.65   20.5   20.50  14.83    1   39
## Zip Code*                    2 40  19.50  10.86   19.5   19.59  14.08    1   36
## State*                       3 40   5.92   3.45    4.5    5.91   5.19    1   11
## Project Type*                4 40   1.73   0.45    2.0    1.78   0.00    1    2
## Project Type (Detail)*       5 40   2.67   0.97    3.0    2.69   1.48    1    5
## Pursuit                      6 40 978.45 733.53  669.5  951.78 702.75   89 2100
## In-Process                   7 40 455.12 221.66  427.5  437.34 195.70   62 1086
## In-Process & Anchor Open     8 40 129.05 161.11   91.0  105.28 134.18 -122  688
## Complete                     9 40 365.00   0.00  365.0  365.00   0.00  365  365
## Total Project Costs...13*   10 40  20.50  11.69   20.5   20.50  14.83    1   40
## Allocation Check*           11 40  20.50  11.69   20.5   20.50  14.83    1   40
##                           range  skew kurtosis     se
## Business Unit*               38 -0.01    -1.30   1.84
## Zip Code*                    35 -0.04    -1.32   1.72
## State*                       10  0.08    -1.47   0.55
## Project Type*                 1 -0.97    -1.08   0.07
## Project Type (Detail)*        4  0.01    -0.48   0.15
## Pursuit                    2011  0.28    -1.72 115.98
## In-Process                 1024  0.75     0.52  35.05
## In-Process & Anchor Open    810  1.42     2.14  25.47
## Complete                      0   NaN      NaN   0.00
## Total Project Costs...13*    39  0.00    -1.29   1.85
## Allocation Check*            39  0.00    -1.29   1.85
# View the first few rows of the data
head(subset_data)
## # A tibble: 6 × 11
##   `Business Unit` `Zip Code` State `Project Type` Project Type (Detail…¹ Pursuit
##   <chr>           <chr>      <chr> <chr>          <chr>                    <dbl>
## 1 6060901         90066      CA    Development    Development               1734
## 2 9008804         22310      VA    Redevelopment  Renovation                  89
## 3 8013112         33179      FL    Redevelopment  Anchor Repositioning      2008
## 4 8004301         33180      FL    Redevelopment  Significant Redevelop…    1734
## 5 66502           78751      TX    Redevelopment  Redevelopment             1124
## 6 8012831         94015      CA    Redevelopment  Anchor Repositioning      1734
## # ℹ abbreviated name: ¹​`Project Type (Detail)`
## # ℹ 5 more variables: `In-Process` <dbl>, `In-Process & Anchor Open` <dbl>,
## #   Complete <dbl>, `Total Project Costs...13` <chr>, `Allocation Check` <chr>

Exploratory Data Analysis

# Load required libraries
library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)

# Explore the relationship between project type and completion status
completion_status <- data %>%
  group_by(`Project Type`, Complete) %>%
  summarize(count = n())
completion_status
## # A tibble: 2 × 3
## # Groups:   Project Type [2]
##   `Project Type` Complete count
##   <chr>             <dbl> <int>
## 1 Development         365    11
## 2 Redevelopment       365    29
# Count of projects by project type
project_type_counts <- data %>%
  group_by(`Project Type`) %>%
  summarize(count = n())


# Correlation matrix for numeric variables
numeric_cols <- select_if(data, is.numeric)
cor_matrix <- cor(numeric_cols)
print(cor_matrix)
##                              Pursuit In-Process In-Process & Anchor Open
## Pursuit                   1.00000000  0.2715440              -0.00807444
## In-Process                0.27154399  1.0000000               0.58602381
## In-Process & Anchor Open -0.00807444  0.5860238               1.00000000
## Complete                          NA         NA                       NA
##                          Complete
## Pursuit                        NA
## In-Process                     NA
## In-Process & Anchor Open       NA
## Complete                        1
# Visualization

# Count the number of projects by project type
project_type_counts <- data %>% count(`Project Type`, sort = TRUE)

# Bar plot of project type counts
ggplot(project_type_counts, aes(x = `Project Type`, y = n)) +
  geom_bar(stat = "identity", fill = "cornflowerblue") +
  labs(title = "Project Type Counts", x = "Project Type", y = "Count") +
  theme_bw()

# Reshape the data to long format
data_long <- tidyr::pivot_longer(data, cols = c(`Complete`, `In-Process`, `In-Process & Anchor Open`), names_to = "Status", values_to = "Count")


# Create a bar plot of completions, in-process, and in-process & anchor open by states
ggplot(data_long, aes(x = State, y = Count, fill = Status)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Status of Projects by States", x = "State", y = "Count") +
  theme_bw() +
  facet_wrap(~ Status, nrow = 1) +
  theme(axis.text.x = element_text(angle = 78, hjust = 1))

Business cycles. Does the expenditure tend to vary with the level of economic activity or are they independent of cycles?

# Create a bar plot of project costs (economic activity) by state
ggplot(data, aes(x = State, y = `Total Project Costs...13`)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Project Costs by States Economic Activity", x = "State", y = "Project Costs") +
  theme_bw()+ theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())

Trends. Are locations or project types affecting expenditure tend?

library(ggplot2)

ggplot(data, aes(x = State, y = `Total Project Costs...13`, color = `Project Type`)) +
  geom_point() +
  labs(x = "State", y = "Total Project Costs", title = "Relationship between State and Total Project Costs") +
  theme_bw()

Outliers and historical anomalies. Does the data contain any extreme values that need to be explained? It could be that these represent highly anomalous events that don’t add to the predictive power of the data set.

# Numeric columns to analyze
numeric_columns <- names(data)[sapply(data, is.numeric)]


# Visualize data using box plots
boxplot_data <- data[, numeric_columns]
boxplot(boxplot_data, main = "Boxplot of Numeric Variables")

# Calculate z-scores for all numerical variables
z_scores <- scale(data[, sapply(data, is.numeric)])

# Identify outliers using z-scores
z_scores <- as.data.frame(scale(data[, numeric_columns]))
outliers <- which(abs(z_scores) > 3)
num_outliers <- length(outliers)  # Count of outliers
print(paste("Number of outliers:", num_outliers))
## [1] "Number of outliers: 1"

Relationships between variables. Are there important relationships between variables that could aid in forecasting? Based on your exploratory analysis and understanding of the data, you identify and propose potential methods (regression, time series forecasting, machine learning, etc.) to provide solutions to the business problem.

# Perform regression analysis
lm_model <- lm(Complete ~ `Project Type` + `Total Project Costs...13`, data = data)
summary(lm_model)
## 
## Call:
## lm(formula = Complete ~ `Project Type` + `Total Project Costs...13`, 
##     data = data)
## 
## Residuals:
## ALL 40 residuals are 0: no residual degrees of freedom!
## 
## Coefficients: (1 not defined because of singularities)
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            3.650e+02        NaN     NaN      NaN
## `Project Type`Redevelopment           -7.334e-29        NaN     NaN      NaN
## `Total Project Costs...13`$1,486,691  -1.237e-29        NaN     NaN      NaN
## `Total Project Costs...13`$1,653,000  -5.510e-29        NaN     NaN      NaN
## `Total Project Costs...13`$1,677,004  -1.037e-28        NaN     NaN      NaN
## `Total Project Costs...13`$11,011,311  2.473e-29        NaN     NaN      NaN
## `Total Project Costs...13`$11,058,903 -3.425e-28        NaN     NaN      NaN
## `Total Project Costs...13`$11,140,000 -5.049e-29        NaN     NaN      NaN
## `Total Project Costs...13`$11,150,028  6.725e-29        NaN     NaN      NaN
## `Total Project Costs...13`$11,500,000  4.024e-29        NaN     NaN      NaN
## `Total Project Costs...13`$11,631,425  1.610e-29        NaN     NaN      NaN
## `Total Project Costs...13`$12,085,081  5.204e-28        NaN     NaN      NaN
## `Total Project Costs...13`$12,839,060 -2.752e-28        NaN     NaN      NaN
## `Total Project Costs...13`$16,364,000 -2.917e-28        NaN     NaN      NaN
## `Total Project Costs...13`$17,279,113 -6.419e-29        NaN     NaN      NaN
## `Total Project Costs...13`$18,884,506 -4.363e-28        NaN     NaN      NaN
## `Total Project Costs...13`$2,245,522  -2.304e-28        NaN     NaN      NaN
## `Total Project Costs...13`$20,000,000 -4.055e-28        NaN     NaN      NaN
## `Total Project Costs...13`$21,885,063  1.361e-28        NaN     NaN      NaN
## `Total Project Costs...13`$28,265,750  7.190e-13        NaN     NaN      NaN
## `Total Project Costs...13`$3,000,000   3.173e-44        NaN     NaN      NaN
## `Total Project Costs...13`$3,055,370  -1.090e-44        NaN     NaN      NaN
## `Total Project Costs...13`$3,213,526   1.015e-44        NaN     NaN      NaN
## `Total Project Costs...13`$3,246,790  -1.771e-28        NaN     NaN      NaN
## `Total Project Costs...13`$3,377,806   4.667e-44        NaN     NaN      NaN
## `Total Project Costs...13`$3,700,000   1.818e-28        NaN     NaN      NaN
## `Total Project Costs...13`$3,800,000   2.058e-44        NaN     NaN      NaN
## `Total Project Costs...13`$4,000,000  -4.852e-29        NaN     NaN      NaN
## `Total Project Costs...13`$4,673,340   8.918e-45        NaN     NaN      NaN
## `Total Project Costs...13`$440,000     4.986e-29        NaN     NaN      NaN
## `Total Project Costs...13`$45,000,000  1.367e-29        NaN     NaN      NaN
## `Total Project Costs...13`$49,066,000  1.367e-29        NaN     NaN      NaN
## `Total Project Costs...13`$5,000,000   4.686e-44        NaN     NaN      NaN
## `Total Project Costs...13`$6,400,000  -4.068e-44        NaN     NaN      NaN
## `Total Project Costs...13`$6,450,000  -1.788e-28        NaN     NaN      NaN
## `Total Project Costs...13`$65,000,000  1.367e-29        NaN     NaN      NaN
## `Total Project Costs...13`$7,390,805  -3.549e-44        NaN     NaN      NaN
## `Total Project Costs...13`$7,460,362  -2.664e-45        NaN     NaN      NaN
## `Total Project Costs...13`$7,803,702   1.716e-45        NaN     NaN      NaN
## `Total Project Costs...13`$8,260,000          NA         NA      NA       NA
## `Total Project Costs...13`$9,900,546   2.809e-29        NaN     NaN      NaN
## 
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared:    NaN,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 39 and 0 DF,  p-value: NA

How does Monthly capital spend trend differently by project size, project duration or state

library(ggplot2)

# Subset the relevant columns from the data
subset_data <- data[, c("Total Project Costs...13", "Project Type", "State")]

# Remove rows with missing values in Total Project Costs...13
subset_data <- subset_data[!is.na(subset_data$`Total Project Costs...13`), ]

# Create a box plot to compare Total Project Costs by Project Type
ggplot(subset_data, aes(x = `Project Type`, y = `Total Project Costs...13`)) +
  geom_boxplot() +
  labs(x = "Project Type", y = "Total Project Costs",
       title = "Comparison of Total Project Costs by Project Type") +
  theme_bw()

# Create a bar plot to compare Total Project Costs by State
ggplot(subset_data, aes(x = `State`, y = `Total Project Costs...13`, fill = `State`)) +
  geom_bar(stat = "identity") +
  labs(x = "State", y = "Total Project Costs",
       title = "Comparison of Total Project Costs by State") +
  theme_bw()+
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())