library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
# Load the smart-campus room-usage records into a data frame.
campus <- read.csv(file = "advanced_smart_campus_7500_rows.csv")

1 Level 1: UNDERSTANDING THE DATA

1.1 Question 1.1: What is the structure of the dataset?

# Compact overview of the data frame: each column's name, type, and first
# few values.
utils::str(campus)
## 'data.frame':    7500 obs. of  14 variables:
##  $ Room_ID      : chr  "R1" "R2" "R3" "R4" ...
##  $ Capacity     : int  139 96 122 128 132 47 113 136 87 116 ...
##  $ Students_Used: int  139 23 39 13 132 47 28 90 87 110 ...
##  $ Electricity  : int  11 30 12 16 18 12 47 54 61 51 ...
##  $ Time_Slot    : chr  "Afternoon" "Morning" "Afternoon" "Evening" ...
##  $ Date         : chr  "2024-01-01 00:00:00" "2024-01-01 01:00:00" "2024-01-01 02:00:00" "2024-01-01 03:00:00" ...
##  $ Department   : chr  "CE" "CS" "CS" "ME" ...
##  $ Floor        : int  1 1 5 4 3 3 2 7 7 6 ...
##  $ Utilization  : num  100 24 32 10.2 100 ...
##  $ Efficiency   : num  12.636 0.767 3.25 0.812 7.333 ...
##  $ Day          : chr  "2024-01-01" "2024-01-01" "2024-01-01" "2024-01-01" ...
##  $ Hour         : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ Is_Weekend   : chr  "False" "False" "False" "False" ...
##  $ Load_Level   : chr  "High" "Low" "Low" "Low" ...
# Number of rows followed by number of columns.
c(nrow(campus), ncol(campus))
## [1] 7500   14
# Preview the first six records.
head(campus, n = 6L)
##   Room_ID Capacity Students_Used Electricity Time_Slot                Date
## 1      R1      139           139          11 Afternoon 2024-01-01 00:00:00
## 2      R2       96            23          30   Morning 2024-01-01 01:00:00
## 3      R3      122            39          12 Afternoon 2024-01-01 02:00:00
## 4      R4      128            13          16   Evening 2024-01-01 03:00:00
## 5      R5      132           132          18   Morning 2024-01-01 04:00:00
## 6      R6       47            47          12 Afternoon 2024-01-01 05:00:00
##   Department Floor Utilization Efficiency        Day Hour Is_Weekend Load_Level
## 1         CE     1   100.00000 12.6363636 2024-01-01    0      False       High
## 2         CS     1    23.95833  0.7666667 2024-01-01    1      False        Low
## 3         CS     5    31.96721  3.2500000 2024-01-01    2      False        Low
## 4         ME     4    10.15625  0.8125000 2024-01-01    3      False        Low
## 5         CE     3   100.00000  7.3333333 2024-01-01    4      False       High
## 6        ECE     3   100.00000  3.9166667 2024-01-01    5      False       High

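Answer: The data set contains 7,500 observations of 14 variables describing campus operations, including room capacity, the number of students using each room (attendance), electricity usage, time slot, department, floor, utilization, and efficiency.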

1.2 Question 1.2: Are there any missing values in the dataset?

# Missing values per column.
campus |> is.na() |> colSums()
##       Room_ID      Capacity Students_Used   Electricity     Time_Slot 
##             0             0             0             0             0 
##          Date    Department         Floor   Utilization    Efficiency 
##             0             0             0             0             0 
##           Day          Hour    Is_Weekend    Load_Level 
##             0             0             0             0
# Missing values across the whole data frame.
campus |> is.na() |> sum()
## [1] 0

Answer: Every column reports 0 missing values, so there are no missing values. The data set is complete and clean.

1.3 Question 1.3: What is the average and median utilization?

# Average Utilization, skipping any missing values.
with(campus, mean(Utilization, na.rm = TRUE))
## [1] 74.32348
# Middle Utilization value, skipping any missing values.
with(campus, median(Utilization, na.rm = TRUE))
## [1] 88.80215

Answer: The mean and median were calculated to understand central tendency. The mean (74.32) gives the average utilization, while the median (88.80) is the middle value, which is less affected by outliers. Missing values were ignored using na.rm = TRUE. Together they show the overall efficiency of room usage.

1.4 Question 1.4: What is the range of capacity and electricity usage?

# Smallest and largest room capacity, skipping any missing values.
with(campus, range(Capacity, na.rm = TRUE))
## [1]  30 149
# Smallest and largest electricity reading, skipping any missing values.
with(campus, range(Electricity, na.rm = TRUE))
## [1]  5 79

Answer: The range shows the lowest and highest values, indicating the spread of the data: capacity runs from 30 to 149 and electricity usage from 5 to 79. This is useful for understanding how widely the values vary.

1.5 Question 1.5: Which departments appear most frequently?

# Record count per department, most frequent first.
campus$Department |>
  table() |>
  sort(decreasing = TRUE)
## 
##   IT   ME  ECE   CE  BBA   CS 
## 1322 1293 1259 1226 1212 1188

Answer: The frequency of each department was calculated using table(), and the results were sorted in descending order to identify which departments have the highest and lowest number of records.

2 Level 2: DATA CLEANING & MANIPULATION

2.1 Question 2.1: Remove duplicate records

# Start again from the raw file before de-duplicating.
campus <- read.csv(file = "advanced_smart_campus_7500_rows.csv")

# Number of rows that exactly repeat an earlier row.
campus |> duplicated() |> sum()
## [1] 0
# Keep only the first occurrence of every row.
campus <- campus[!duplicated(campus), , drop = FALSE]

# Row and column counts after de-duplication.
dim(campus)
## [1] 7500   14

Answer: The data set was checked for duplicate records using the duplicated() function. The result shows that there are no duplicate rows in the data set. After applying duplicate removal, the number of rows remains unchanged, confirming that the data set is already clean.

2.2 Question 2.2: Top 10 rooms with highest utilization

library(dplyr)

# Ten records with the highest Utilization, showing only the columns needed
# to identify and compare them.
campus |>
  select(Room_ID, Department, Utilization, Efficiency) |>
  arrange(desc(Utilization)) |>
  head(n = 10)
##    Room_ID Department Utilization Efficiency
## 1       R1         CE         100  12.636364
## 2       R5         CE         100   7.333333
## 3       R6        ECE         100   3.916667
## 4       R9        BBA         100   1.426230
## 5      R11         ME         100  10.583333
## 6      R17         CE         100   1.381818
## 7      R21         IT         100   2.511111
## 8      R26        BBA         100   1.256410
## 9      R27         IT         100   1.215385
## 10     R28         CS         100   1.734694

Answer: This identifies the top 10 rooms with highest utilization. These rooms are being used efficiently and indicate high demand areas in the campus.

### Question 2.3: Identify underutilized rooms

# First attempt: the cut-off is written as a fraction (0.3). It is kept for
# the record because it returns no rows.
campus %>%
  filter(Utilization < 0.3) %>%
  select(Room_ID, Capacity, Students_Used, Utilization)
## [1] Room_ID       Capacity      Students_Used Utilization  
## <0 rows> (or 0-length row.names)
# The observed range shows Utilization is stored on a 0-100 percent scale,
# which is why a 0.3 cut-off matches nothing.
range(campus$Utilization)
## [1]   6.756757 100.000000
# Corrected query: rooms whose Utilization is below 30 percent.
campus %>%
  filter(Utilization < 30) %>%
  select(Room_ID, Capacity, Students_Used, Utilization)

Answer: The utilization values range from approximately 6.76 to 100, indicating that the data is stored as a percentage rather than a fraction. Initially, using a threshold of 0.3 returned no records. After correcting the condition to Utilization < 30, the output identifies underutilized rooms with low usage efficiency. This helps identify wasted resources and is useful for improving scheduling and allocation.

2.3 Question 2.4: Average utilization by department

# Mean Utilization per department, highest first.
campus |>
  group_by(Department) |>
  summarise(avg_util = mean(Utilization, na.rm = TRUE)) |>
  arrange(-avg_util)
## # A tibble: 6 × 2
##   Department avg_util
##   <chr>         <dbl>
## 1 ME             75.2
## 2 ECE            74.9
## 3 CS             74.8
## 4 BBA            74.7
## 5 CE             73.6
## 6 IT             72.7

Answer: This compares department-wise efficiency. The data was grouped by department, and the average utilization was calculated for each group. The results were sorted in descending order to identify departments with the highest and lowest utilization. Departments with low utilization should optimize classroom allocation to improve efficiency.

### Question 2.5: Average electricity usage by floor

# Mean Electricity per floor, skipping any missing values.
group_by(campus, Floor) |>
  summarise(avg_electricity = mean(Electricity, na.rm = TRUE))
## # A tibble: 7 × 2
##   Floor avg_electricity
##   <int>           <dbl>
## 1     1            41.5
## 2     2            43.0
## 3     3            42.0
## 4     4            42.5
## 5     5            42.2
## 6     6            41.9
## 7     7            42.4

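Answer: This shows energy consumption by floor. Average electricity usage is nearly uniform, ranging only from 41.5 (floor 1) to 43.0 (floor 2), so no floor stands out as a heavy consumer.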

3 Level 3: DATA TRANSFORMATION

3.1 Question 3.1: Create utilization category

# Bucket Utilization into three labels: "Low" up to and including 40,
# "Medium" above 40 up to and including 70, and "High" above 70.
# The result is stored as plain character, not a factor.
campus$Utilization_Level <- as.character(
  cut(
    campus$Utilization,
    breaks = c(-Inf, 40, 70, Inf),
    labels = c("Low", "Medium", "High"),
    right = TRUE,
    include.lowest = TRUE
  )
)

Answer: Categorizes rooms based on usage levels.

3.2 Question 3.2: Count rooms in each category

# Number of records in each Utilization_Level category.
campus$Utilization_Level |> table()
## 
##   High    Low Medium 
##   4621   1396   1483

Answer: Shows distribution of High, Medium, Low usage. Categorization helps in better decision-making and comparison across different levels.

3.3 Question 3.3: Average efficiency by department

# Mean Efficiency per department, highest first.
campus |>
  group_by(Department) |>
  summarise(avg_eff = mean(Efficiency, na.rm = TRUE)) |>
  arrange(-avg_eff)
## # A tibble: 6 × 2
##   Department avg_eff
##   <chr>        <dbl>
## 1 ME            2.48
## 2 CS            2.48
## 3 CE            2.39
## 4 BBA           2.36
## 5 IT            2.32
## 6 ECE           2.29

Answer: The dataset was grouped by department, and the average efficiency was calculated. The results were sorted in descending order to identify the most and least efficient departments. Departments with low efficiency should optimize resource usage to improve performance.

3.4 Question 3.4: Top 10 most efficient rooms

# Ten records with the highest Efficiency, keeping only the room identifier
# and the score.
campus |>
  select(Room_ID, Efficiency) |>
  arrange(desc(Efficiency)) |>
  head(n = 10)
##    Room_ID Efficiency
## 1    R7015   28.80000
## 2     R117   28.20000
## 3    R6971   26.60000
## 4    R1182   26.40000
## 5    R5469   25.20000
## 6    R4038   24.40000
## 7    R5903   24.00000
## 8    R6621   23.00000
## 9    R1906   22.00000
## 10    R510   21.66667

Answer: The dataset was sorted in descending order of efficiency to identify the top 10 most efficient rooms. Only relevant columns were selected for clarity.

4 Level 4: DATA VISUALIZATION

4.1 Question 4.1: Histogram of utilization

library(ggplot2)

# Histogram of Utilization using 30 bins.
ggplot(data = campus, mapping = aes(x = Utilization)) +
  geom_histogram(bins = 30) +
  ggtitle("Utilization Distribution") +
  xlab("Utilization") +
  ylab("Count")

Answer: A histogram was plotted to understand the distribution of utilization values. It shows how frequently different utilization levels occur in the dataset.

4.2 Question 4.2: Boxplot of electricity usage by department

# One box per department summarising its Electricity readings.
ggplot(data = campus, mapping = aes(x = Department, y = Electricity)) +
  geom_boxplot() +
  ggtitle("Electricity Usage by Department") +
  xlab("Department") +
  ylab("Electricity")

Answer:

A boxplot was used to compare electricity usage across departments. It shows the median, spread, and presence of outliers for each department.

4.3 Question 4.3: Scatter plot (Capacity vs Students Used)

# Scatter of room capacity against the number of students using the room;
# one point per record.
ggplot(data = campus, mapping = aes(x = Capacity, y = Students_Used)) +
  geom_point() +
  ggtitle("Capacity vs Students Used") +
  xlab("Capacity") +
  ylab("Students Used")

Answer: A scatter plot was used to analyze the relationship between classroom capacity and the number of students using it. Each point represents a classroom.

4.4 Question 4.4: Bar chart for load levels

# Bar per Load_Level category; bar height is the number of records.
ggplot(data = campus, mapping = aes(x = Load_Level)) +
  geom_bar() +
  ggtitle("Load Level Distribution") +
  xlab("Load Level") +
  ylab("Count")

Answer: A bar chart was used to show the distribution of load levels across the dataset. It represents how many observations fall into each category such as Low, Medium, and High. If most values are in low category, it indicates poor resource utilization. Most classrooms fall in the higher category, which means overall utilization is good, but some are still underutilized

4.5 Question 4.5: Density Plot (Distribution Shape)

# Smoothed density curve of Utilization, filled green, on a minimal theme.
ggplot(data = campus, mapping = aes(x = Utilization)) +
  geom_density(fill = "green") +
  ggtitle("Density Plot of Utilization") +
  theme_minimal()

Answer: A Density plot is used to understand the overall shape of utilization data. It is a smooth curve that shows how values are distributed. It helps us see whether the data is balanced or skewed. The curve is slightly shifted, which shows the data is left-skewed, meaning most values are higher.

4.6 Question 4.6: Scatter Plot (Relationship Analysis)

# Scatter of room capacity against electricity usage, drawn with blue points
# on a minimal theme.
ggplot(data = campus, mapping = aes(x = Capacity, y = Electricity)) +
  geom_point(color = "blue") +
  ggtitle("Capacity vs Electricity") +
  theme_minimal()

Answer: A scatter plot is used to show the relationship between capacity and electricity usage. Each point represents one classroom. From the graph, we can see that as capacity increases, electricity usage also increases.

4.7 Question 4.7: Bar Chart Showing Average Utilization by Department

# Bar per department whose height is the mean Utilization of that
# department's records (computed by the "summary" stat with fun = "mean").
# The title and axis labels are set explicitly, like the other plots in this
# report, so the y axis is not mistaken for raw Utilization values.
ggplot(campus, aes(x = Department, y = Utilization)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(title = "Average Utilization by Department",
       x = "Department",
       y = "Average Utilization") +
  theme_minimal()

Answer:

A bar chart was used to compare the average utilization across different departments. The mean function was applied to calculate the average utilization for each department. It helps identify which department is using classrooms more efficiently.

5 Level 5: EXPLORATORY DATA ANALYSIS

5.1 Question 5.1: Outlier Detection using IQR Method

# Outlier check with the 1.5 * IQR rule: any Utilization value below
# Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR is reported.
# na.rm = TRUE matches the other summaries in this report; without it,
# quantile() stops with an error if the column contains a missing value.
Q1 <- quantile(campus$Utilization, 0.25, na.rm = TRUE)
Q3 <- quantile(campus$Utilization, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1

# Fences beyond which a value counts as an outlier.
lower <- Q1 - 1.5 * IQR_value
upper <- Q3 + 1.5 * IQR_value

# which() drops NA comparisons, so missing values are never returned as
# spurious NA "outliers".
campus$Utilization[
  which(campus$Utilization < lower | campus$Utilization > upper)
]
## numeric(0)

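Answer: The IQR shows the spread of the middle 50% of the data and helps identify variability. No utilization value falls outside the 1.5 × IQR fences (the result is numeric(0)), so this method finds no outliers.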

5.2 Question 5.2: Detect Outliers using Z-score Method

# Standardise Utilization: distance from the mean in units of standard
# deviation. Missing values are ignored when computing the mean and sd.
z_scores <- (campus$Utilization - mean(campus$Utilization, na.rm = TRUE)) /
  sd(campus$Utilization, na.rm = TRUE)

# Values more than 3 standard deviations from the mean. which() drops NA
# z-scores so a missing Utilization is not returned as an NA "outlier".
outliers <- campus$Utilization[which(abs(z_scores) > 3)]

outliers
## numeric(0)

Answer: Outliers were checked using the Z-score method. Values whose absolute Z-score exceeds 3 are considered extreme; the result is numeric(0), so no utilization value qualifies as an outlier.

5.3 Question 5.3: Detect skewness (Left or Right Skewed Data)

library(moments)
# Skewness of Utilization; a negative value means a longer left tail.
moments::skewness(x = campus$Utilization)
## [1] -0.7341737

Answer: Skewness was calculated to understand the shape of the distribution. A negative value indicates that the data is left-skewed, meaning most values are concentrated on the higher side.