title: “NCB Project Analysis” author: “Jitender Singh, Srija Sama, Sanjani Ksheerasagar” date: “2025-11-30” output: word_document: default html_document: default —


library(readxl)
library(dplyr)
library(ggplot2)
library(corrplot)
library(cluster)
library(tidyr)
data <- read_excel("C:/Users/jithu/Downloads/W07404-XLS-ENG.xlsx", sheet = "data")

data$bloc <- factor(data$bloc, labels = c("Non-Metro", "Metro"))
data$ccon <- factor(data$ccon, labels = c("Low", "High"))

summary(data)
##       bnum            badd                  bloc        bsize      
##  Min.   :  1.00   Length:128         Non-Metro:43   Min.   : 5.00  
##  1st Qu.: 32.75   Class :character   Metro    :85   1st Qu.:11.00  
##  Median : 64.50   Mode  :character                  Median :16.00  
##  Mean   : 64.50                                     Mean   :17.42  
##  3rd Qu.: 96.25                                     3rd Qu.:21.25  
##  Max.   :128.00                                     Max.   :55.00  
##      ecuso           equal           einvol           etra      
##  Min.   :3.030   Min.   :2.610   Min.   :2.740   Min.   :2.510  
##  1st Qu.:3.520   1st Qu.:3.405   1st Qu.:3.260   1st Qu.:3.107  
##  Median :3.750   Median :3.590   Median :3.430   Median :3.260  
##  Mean   :3.758   Mean   :3.589   Mean   :3.441   Mean   :3.258  
##  3rd Qu.:3.973   3rd Qu.:3.775   3rd Qu.:3.650   3rd Qu.:3.420  
##  Max.   :4.700   Max.   :4.300   Max.   :4.110   Max.   :4.030  
##      ecomm           eteam            eeng            eitl      
##  Min.   :2.180   Min.   :2.600   Min.   :2.500   Min.   :1.400  
##  1st Qu.:3.050   1st Qu.:3.390   1st Qu.:3.083   1st Qu.:2.138  
##  Median :3.220   Median :3.580   Median :3.270   Median :2.330  
##  Mean   :3.238   Mean   :3.543   Mean   :3.275   Mean   :2.422  
##  3rd Qu.:3.393   3rd Qu.:3.720   3rd Qu.:3.440   3rd Qu.:2.715  
##  Max.   :3.900   Max.   :4.050   Max.   :4.400   Max.   :3.650  
##       eben           cserq           cbrtel           cbr       
##  Min.   :2.830   Min.   :2.850   Min.   :3.910   Min.   :3.710  
##  1st Qu.:3.550   1st Qu.:3.200   1st Qu.:4.160   1st Qu.:4.030  
##  Median :3.790   Median :3.315   Median :4.260   Median :4.180  
##  Mean   :3.745   Mean   :3.336   Mean   :4.268   Mean   :4.179  
##  3rd Qu.:3.922   3rd Qu.:3.440   3rd Qu.:4.372   3rd Qu.:4.310  
##  Max.   :4.400   Max.   :3.910   Max.   :4.600   Max.   :4.660  
##      cbrpb            cloy        ccon        teltr           prod      
##  Min.   :3.770   Min.   :3.05   Low :64   Min.   :1660   Min.   :166.0  
##  1st Qu.:4.050   1st Qu.:3.65   High:64   1st Qu.:3521   1st Qu.:252.5  
##  Median :4.130   Median :3.77             Median :4046   Median :292.0  
##  Mean   :4.144   Mean   :3.76             Mean   :3977   Mean   :290.5  
##  3rd Qu.:4.242   3rd Qu.:3.86             3rd Qu.:4375   3rd Qu.:324.2  
##  Max.   :4.530   Max.   :4.22             Max.   :5637   Max.   :441.0
data %>%
select(eeng, etra, ecomm, eteam, equal, ecuso) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Score") %>%
ggplot(aes(x = Score, fill = Variable)) +
geom_histogram(bins = 10, alpha = 0.6) +
theme_minimal() +
labs(title = "Distribution of Employee Engagement Metrics")

corr_vars <- c("ecuso","equal","einvol","etra","ecomm","eteam","eeng",
"cserq","cbrtel","cbr","cbrpb","cloy","teltr","prod")

corr_matrix <- cor(data[, corr_vars], use = "complete.obs")
corrplot(corr_matrix, method = "color", type = "upper")

model_prod <- lm(prod ~ eeng + etra + ecomm + eteam + cserq + cloy, data = data)
summary(model_prod)
## 
## Call:
## lm(formula = prod ~ eeng + etra + ecomm + eteam + cserq + cloy, 
##     data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -113.087  -37.717   -2.675   31.589  149.468 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   84.582    111.731   0.757   0.4505  
## eeng          -8.302     18.075  -0.459   0.6468  
## etra         -22.639     27.620  -0.820   0.4140  
## ecomm         41.894     24.855   1.686   0.0945 .
## eteam         14.808     22.343   0.663   0.5087  
## cserq        -59.619     59.129  -1.008   0.3153  
## cloy          84.489     57.412   1.472   0.1437  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.73 on 121 degrees of freedom
## Multiple R-squared:  0.06078,    Adjusted R-squared:  0.0142 
## F-statistic: 1.305 on 6 and 121 DF,  p-value: 0.26
cluster_vars <- data[, c("ecuso","equal","etra","ecomm","eteam","eeng","cloy","prod")]
cl_scaled <- scale(cluster_vars)
set.seed(123)
k3 <- kmeans(cl_scaled, centers = 3)
data$cluster <- as.factor(k3$cluster)

ggplot(data, aes(x = eeng, y = cloy, color = cluster)) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "Cluster Analysis")

write.csv(data, “C:/Users/jithu/Downloads/NCB_cleaned_clusters.csv”, row.names = FALSE)