library(tidyverse) #this is run to load the library before everything
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl) #read raw excel file
ttc_subway_delay_data_2024 <- read_excel("ttc-subway-delay-data-2024.xlsx", 
    col_types = c("date", "text", "text", 
        "text", "text", "numeric", "numeric", 
        "text", "text", "text")) #read excel as designated data types from the designated file directory
View(ttc_subway_delay_data_2024)
#Print the structure of your dataset.
str(ttc_subway_delay_data_2024) #str() function to print the structure
## tibble [26,467 × 10] (S3: tbl_df/tbl/data.frame)
##  $ Date     : POSIXct[1:26467], format: "2024-01-01" "2024-01-01" ...
##  $ Time     : chr [1:26467] "02:00" "02:00" "02:08" "02:13" ...
##  $ Day      : chr [1:26467] "Monday" "Monday" "Monday" "Monday" ...
##  $ Station  : chr [1:26467] "SHEPPARD STATION" "DUNDAS STATION" "DUNDAS STATION" "KENNEDY BD STATION" ...
##  $ Code     : chr [1:26467] "MUI" "MUIS" "MUPAA" "PUTDN" ...
##  $ Min Delay: num [1:26467] 0 0 4 10 4 3 0 7 0 13 ...
##  $ Min Gap  : num [1:26467] 0 0 10 16 10 9 0 13 0 19 ...
##  $ Bound    : chr [1:26467] "N" "N" "N" "E" ...
##  $ Line     : chr [1:26467] "YU" "YU" "YU" "BD" ...
##  $ Vehicle  : chr [1:26467] "5491" "0" "6051" "5284" ...
#List the variables in your dataset.
ls(ttc_subway_delay_data_2024) #ls() function to list the variables
##  [1] "Bound"     "Code"      "Date"      "Day"       "Line"      "Min Delay"
##  [7] "Min Gap"   "Station"   "Time"      "Vehicle"
#Print the top 15 rows of your dataset.
head(ttc_subway_delay_data_2024, n=15) #head() function to print the user-defined number of top rows
## # A tibble: 15 × 10
##    Date                Time  Day    Station    Code  `Min Delay` `Min Gap` Bound
##    <dttm>              <chr> <chr>  <chr>      <chr>       <dbl>     <dbl> <chr>
##  1 2024-01-01 00:00:00 02:00 Monday SHEPPARD … MUI             0         0 N    
##  2 2024-01-01 00:00:00 02:00 Monday DUNDAS ST… MUIS            0         0 N    
##  3 2024-01-01 00:00:00 02:08 Monday DUNDAS ST… MUPAA           4        10 N    
##  4 2024-01-01 00:00:00 02:13 Monday KENNEDY B… PUTDN          10        16 E    
##  5 2024-01-01 00:00:00 02:22 Monday BLOOR STA… MUPAA           4        10 N    
##  6 2024-01-01 00:00:00 02:25 Monday ST CLAIR … MUPAA           3         9 N    
##  7 2024-01-01 00:00:00 02:25 Monday BLOOR STA… MUIRS           0         0 S    
##  8 2024-01-01 00:00:00 02:27 Monday WOODBINE … EUDO            7        13 E    
##  9 2024-01-01 00:00:00 02:28 Monday FINCH STA… MUIRS           0         0 S    
## 10 2024-01-01 00:00:00 02:30 Monday DAVISVILL… MUI            13        19 N    
## 11 2024-01-01 00:00:00 02:36 Monday FINCH STA… MUI            24        30 S    
## 12 2024-01-01 00:00:00 02:45 Monday COLLEGE S… MUIR           12        18 N    
## 13 2024-01-01 00:00:00 03:01 Monday LAWRENCE … SUDP            3         9 N    
## 14 2024-01-01 00:00:00 03:06 Monday LAWRENCE … MUIS            0         0 S    
## 15 2024-01-01 00:00:00 03:17 Monday VICTORIA … MUIS            0         0 <NA> 
## # ℹ 2 more variables: Line <chr>, Vehicle <chr>
#Write a user defined function using any of the variables from the data set.
remove_zero = function(x){
  x %>% filter(`Min Delay`>0, `Min Gap`>0)
  } #define a function that remove zero min delay and zero min gap rows from the data set
ttc_no_zero = remove_zero(ttc_subway_delay_data_2024) #run the defined function and put it as a data set named ttc_no_zero
#Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.
ttc_subway_delay_data_2024 %>% filter(str_detect(Line, "YU")) #filter out delays that contain "YU" in line
## # A tibble: 14,534 × 10
##    Date                Time  Day    Station    Code  `Min Delay` `Min Gap` Bound
##    <dttm>              <chr> <chr>  <chr>      <chr>       <dbl>     <dbl> <chr>
##  1 2024-01-01 00:00:00 02:00 Monday SHEPPARD … MUI             0         0 N    
##  2 2024-01-01 00:00:00 02:00 Monday DUNDAS ST… MUIS            0         0 N    
##  3 2024-01-01 00:00:00 02:08 Monday DUNDAS ST… MUPAA           4        10 N    
##  4 2024-01-01 00:00:00 02:22 Monday BLOOR STA… MUPAA           4        10 N    
##  5 2024-01-01 00:00:00 02:25 Monday ST CLAIR … MUPAA           3         9 N    
##  6 2024-01-01 00:00:00 02:25 Monday BLOOR STA… MUIRS           0         0 S    
##  7 2024-01-01 00:00:00 02:28 Monday FINCH STA… MUIRS           0         0 S    
##  8 2024-01-01 00:00:00 02:30 Monday DAVISVILL… MUI            13        19 N    
##  9 2024-01-01 00:00:00 02:36 Monday FINCH STA… MUI            24        30 S    
## 10 2024-01-01 00:00:00 02:45 Monday COLLEGE S… MUIR           12        18 N    
## # ℹ 14,524 more rows
## # ℹ 2 more variables: Line <chr>, Vehicle <chr>
#Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.
#dependent variables: Min Delay, Min Gap
dependent_v_table = data.frame(
  Min_Delay = ttc_subway_delay_data_2024$`Min Delay`,
  Min_Gap = ttc_subway_delay_data_2024$`Min Gap`) #put dependent variables to a data frame called dependent_v_table
#independent variables: Date, Time, Day, Station, Line, Code, Bound, Vehicle
independent_v_table = data.frame(
  Date = ttc_subway_delay_data_2024$Date,
  Time = ttc_subway_delay_data_2024$Time,
  Day = ttc_subway_delay_data_2024$Day,
  Station = ttc_subway_delay_data_2024$Station,
  Line = ttc_subway_delay_data_2024$Line, 
  Code = ttc_subway_delay_data_2024$Code, 
  Bound = ttc_subway_delay_data_2024$Bound, 
  Vehicle = ttc_subway_delay_data_2024$Vehicle) #put independent variables to a data frame called independent_v_table
dependent_n_independent = cbind(dependent_v_table,independent_v_table) #this code combine the dependent and independent datasets to the new dataset, dependent_n_independent using cbind()
#reshaping con't
#combining rows
union_delay = ttc_subway_delay_data_2024 %>% filter(str_detect(Station, "UNION")) #find out delays that have the string "UNION" in station variable
dupont_delay = ttc_subway_delay_data_2024 %>% filter(str_detect(Station, "DUPONT")) #find out delays that have the string "DUPONT" in station variable
union_and_dupont = rbind(union_delay,dupont_delay) #combine the two data sets
#Remove missing values in your dataset.
ttc_no_blank = ttc_subway_delay_data_2024 %>% drop_na() #remove all missing values in the data set
#Identify and remove duplicated data from your dataset.
unique(ttc_subway_delay_data_2024$Line) #find out unique line
##  [1] "YU"                     "BD"                     "YUS"                   
##  [4] "YU/BD"                  "SHP"                    NA                      
##  [7] "BLOOR DANFORTH"         "YU / BD"                "YU/ BD"                
## [10] "SRT"                    "YUS/BD"                 "SHEP"                  
## [13] "LINE 1"                 "TRACK LEVEL ACTIVITY"   "YU & BD"               
## [16] "109 RANEE"              "ONGE-UNIVERSITY AND BL" "YU/BD/SHP"             
## [19] "BD/ YUS"                "BD/ YU"                 "BD/YU"                 
## [22] "BD / YU"                "20 CLIFFSIDE"
is_duplicated = duplicated(ttc_subway_delay_data_2024) #find if the rows are duplicated in booleans
duplicates_only = ttc_subway_delay_data_2024[is_duplicated,] #put all the rows that are indicated as duplicated to the data set called duplicates_only

distinct_ttc = ttc_subway_delay_data_2024 %>% distinct() #extract the distinct entries from the raw data set
#Reorder multiple rows in descending order
ttc_subway_delay_data_2024 %>% arrange(desc(Station),desc(Day), desc(Code)) #arrange in descending order according to station, day and then code
## # A tibble: 26,467 × 10
##    Date                Time  Day       Station Code  `Min Delay` `Min Gap` Bound
##    <dttm>              <chr> <chr>     <chr>   <chr>       <dbl>     <dbl> <chr>
##  1 2024-05-28 00:00:00 23:00 Tuesday   YU-ST … MUO             0         0 <NA> 
##  2 2024-04-30 00:00:00 23:00 Tuesday   YU- YO… MUO             0         0 <NA> 
##  3 2024-04-30 00:00:00 23:00 Tuesday   YU- YO… MUO             0         0 <NA> 
##  4 2024-11-10 00:00:00 23:33 Sunday    YU & B… MUWEA           0         0 <NA> 
##  5 2024-08-17 00:00:00 01:47 Saturday  YOUNGE… MUO             0         0 <NA> 
##  6 2024-03-17 00:00:00 02:07 Sunday    YOUNG … MUO             0         0 <NA> 
##  7 2024-05-22 00:00:00 16:30 Wednesday YORKDA… TUS             7        12 N    
##  8 2024-04-24 00:00:00 13:24 Wednesday YORKDA… TUCC            6        10 S    
##  9 2024-06-05 00:00:00 17:53 Wednesday YORKDA… SUUT           17        20 N    
## 10 2024-05-15 00:00:00 08:29 Wednesday YORKDA… SUO             3         6 S    
## # ℹ 26,457 more rows
## # ℹ 2 more variables: Line <chr>, Vehicle <chr>
#Rename some of the column names in your dataset.
names(ttc_subway_delay_data_2024)[5] = "Delay Reason Code" #rename Code to Delay Reason Code
names(ttc_subway_delay_data_2024)[10] = "Vehicle_number" #rename column Vehicle to Vehicle_number
#Add new variables in your data frame by using a mathematical function (for e.g. – multiply an existing column by 2 and add it as a new variable to your data frame)
ttc_subway_delay_data_2024$delay_plus_gap = ttc_subway_delay_data_2024$`Min Delay` + ttc_subway_delay_data_2024$`Min Gap` #make an addition to the delay and gap columns and put it as delay_plus_gap
ttc_subway_delay_data_2024$delay_multiply_gap = ttc_subway_delay_data_2024$`Min Delay`*ttc_subway_delay_data_2024$`Min Gap` #multiply min delay and min gap and create a new column to the data frame
#Create a training set using a random number generator engine.
set.seed(333) #make it reproducible
training_ttc = ttc_subway_delay_data_2024 %>% sample_n(100, replace = FALSE) #produce training set with 100 data
#Print the summary statistics of your dataset.
summary(ttc_subway_delay_data_2024)
##       Date                         Time               Day           
##  Min.   :2024-01-01 00:00:00   Length:26467       Length:26467      
##  1st Qu.:2024-04-01 00:00:00   Class :character   Class :character  
##  Median :2024-07-02 00:00:00   Mode  :character   Mode  :character  
##  Mean   :2024-07-01 21:44:28                                        
##  3rd Qu.:2024-10-03 00:00:00                                        
##  Max.   :2024-12-31 00:00:00                                        
##    Station          Delay Reason Code    Min Delay          Min Gap       
##  Length:26467       Length:26467       Min.   :  0.000   Min.   :  0.000  
##  Class :character   Class :character   1st Qu.:  0.000   1st Qu.:  0.000  
##  Mode  :character   Mode  :character   Median :  0.000   Median :  0.000  
##                                        Mean   :  2.806   Mean   :  4.111  
##                                        3rd Qu.:  4.000   3rd Qu.:  8.000  
##                                        Max.   :716.000   Max.   :719.000  
##     Bound               Line           Vehicle_number     delay_plus_gap    
##  Length:26467       Length:26467       Length:26467       Min.   :   0.000  
##  Class :character   Class :character   Class :character   1st Qu.:   0.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :   0.000  
##                                                           Mean   :   6.917  
##                                                           3rd Qu.:  12.000  
##                                                           Max.   :1435.000  
##  delay_multiply_gap
##  Min.   :     0.0  
##  1st Qu.:     0.0  
##  Median :     0.0  
##  Mean   :   116.1  
##  3rd Qu.:    28.0  
##  Max.   :514804.0
#Use any of the numerical variables from the dataset and perform the following statistical functions.
#mean
mean(ttc_subway_delay_data_2024$`Min Gap`) #find the mean for the column Min Gap
## [1] 4.110628
#median
median(ttc_subway_delay_data_2024$`Min Delay`) #find the median for the column Min Delay
## [1] 0
#mode
freq_table = table(ttc_subway_delay_data_2024$`Min Delay`) #create a frequency table on Min Delay
max_freq = max(freq_table) #find the max frequency from the frequency table
modes <- names(freq_table[freq_table == max_freq]) #get the value with the greatest frequency
modes #display mode
## [1] "0"
#range
range(ttc_subway_delay_data_2024$`Min Delay`) #find the range for Min Delay
## [1]   0 716
#Plot a scatter plot for any 2 variables in your dataset.
ggplot(ttc_subway_delay_data_2024, aes(x=`Min Delay`, y=`Min Gap`))+geom_point()

#Plot a bar plot for any 2 variables in your dataset
desired_day_order = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday") #arrange day in a logical order
day_manual_order <- ttc_subway_delay_data_2024 %>% mutate(Day = factor(Day, levels = desired_day_order)) #create new dataset with ordered day
ggplot(day_manual_order, aes(x=Day, fill = Bound, colour = Bound))+geom_bar(position="dodge") #create bar plot on the count of bound and day

#Find the correlation between any 2 variables by applying least square linear regression model.
ttc_corr = cor(ttc_subway_delay_data_2024$`Min Delay`,ttc_subway_delay_data_2024$`Min Gap`,method = "pearson") #find correlation between min delay and min gap with least square linear regression model