library(tidyverse) #this is run to load the library before everything
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl) #read raw excel file
ttc_subway_delay_data_2024 <- read_excel("ttc-subway-delay-data-2024.xlsx",
col_types = c("date", "text", "text",
"text", "text", "numeric", "numeric",
"text", "text", "text")) #read excel as designated data types from the designated file directory
View(ttc_subway_delay_data_2024)
#Print the structure of your dataset.
str(ttc_subway_delay_data_2024) #str() function to print the structure
## tibble [26,467 × 10] (S3: tbl_df/tbl/data.frame)
## $ Date : POSIXct[1:26467], format: "2024-01-01" "2024-01-01" ...
## $ Time : chr [1:26467] "02:00" "02:00" "02:08" "02:13" ...
## $ Day : chr [1:26467] "Monday" "Monday" "Monday" "Monday" ...
## $ Station : chr [1:26467] "SHEPPARD STATION" "DUNDAS STATION" "DUNDAS STATION" "KENNEDY BD STATION" ...
## $ Code : chr [1:26467] "MUI" "MUIS" "MUPAA" "PUTDN" ...
## $ Min Delay: num [1:26467] 0 0 4 10 4 3 0 7 0 13 ...
## $ Min Gap : num [1:26467] 0 0 10 16 10 9 0 13 0 19 ...
## $ Bound : chr [1:26467] "N" "N" "N" "E" ...
## $ Line : chr [1:26467] "YU" "YU" "YU" "BD" ...
## $ Vehicle : chr [1:26467] "5491" "0" "6051" "5284" ...
#List the variables in your dataset.
ls(ttc_subway_delay_data_2024) #ls() function to list the variables
## [1] "Bound" "Code" "Date" "Day" "Line" "Min Delay"
## [7] "Min Gap" "Station" "Time" "Vehicle"
#Print the top 15 rows of your dataset.
head(ttc_subway_delay_data_2024, n=15) #head() function to print the user-defined number of top rows
## # A tibble: 15 × 10
## Date Time Day Station Code `Min Delay` `Min Gap` Bound
## <dttm> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 2024-01-01 00:00:00 02:00 Monday SHEPPARD … MUI 0 0 N
## 2 2024-01-01 00:00:00 02:00 Monday DUNDAS ST… MUIS 0 0 N
## 3 2024-01-01 00:00:00 02:08 Monday DUNDAS ST… MUPAA 4 10 N
## 4 2024-01-01 00:00:00 02:13 Monday KENNEDY B… PUTDN 10 16 E
## 5 2024-01-01 00:00:00 02:22 Monday BLOOR STA… MUPAA 4 10 N
## 6 2024-01-01 00:00:00 02:25 Monday ST CLAIR … MUPAA 3 9 N
## 7 2024-01-01 00:00:00 02:25 Monday BLOOR STA… MUIRS 0 0 S
## 8 2024-01-01 00:00:00 02:27 Monday WOODBINE … EUDO 7 13 E
## 9 2024-01-01 00:00:00 02:28 Monday FINCH STA… MUIRS 0 0 S
## 10 2024-01-01 00:00:00 02:30 Monday DAVISVILL… MUI 13 19 N
## 11 2024-01-01 00:00:00 02:36 Monday FINCH STA… MUI 24 30 S
## 12 2024-01-01 00:00:00 02:45 Monday COLLEGE S… MUIR 12 18 N
## 13 2024-01-01 00:00:00 03:01 Monday LAWRENCE … SUDP 3 9 N
## 14 2024-01-01 00:00:00 03:06 Monday LAWRENCE … MUIS 0 0 S
## 15 2024-01-01 00:00:00 03:17 Monday VICTORIA … MUIS 0 0 <NA>
## # ℹ 2 more variables: Line <chr>, Vehicle <chr>
#Write a user defined function using any of the variables from the data set.
remove_zero = function(x){
x %>% filter(`Min Delay`>0, `Min Gap`>0)
} #define a function that remove zero min delay and zero min gap rows from the data set
ttc_no_zero = remove_zero(ttc_subway_delay_data_2024) #run the defined function and put it as a data set named ttc_no_zero
#Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.
ttc_subway_delay_data_2024 %>% filter(str_detect(Line, "YU")) #filter out delays that contain "YU" in line
## # A tibble: 14,534 × 10
## Date Time Day Station Code `Min Delay` `Min Gap` Bound
## <dttm> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 2024-01-01 00:00:00 02:00 Monday SHEPPARD … MUI 0 0 N
## 2 2024-01-01 00:00:00 02:00 Monday DUNDAS ST… MUIS 0 0 N
## 3 2024-01-01 00:00:00 02:08 Monday DUNDAS ST… MUPAA 4 10 N
## 4 2024-01-01 00:00:00 02:22 Monday BLOOR STA… MUPAA 4 10 N
## 5 2024-01-01 00:00:00 02:25 Monday ST CLAIR … MUPAA 3 9 N
## 6 2024-01-01 00:00:00 02:25 Monday BLOOR STA… MUIRS 0 0 S
## 7 2024-01-01 00:00:00 02:28 Monday FINCH STA… MUIRS 0 0 S
## 8 2024-01-01 00:00:00 02:30 Monday DAVISVILL… MUI 13 19 N
## 9 2024-01-01 00:00:00 02:36 Monday FINCH STA… MUI 24 30 S
## 10 2024-01-01 00:00:00 02:45 Monday COLLEGE S… MUIR 12 18 N
## # ℹ 14,524 more rows
## # ℹ 2 more variables: Line <chr>, Vehicle <chr>
#Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.
#dependent variables: Min Delay, Min Gap
dependent_v_table = data.frame(
Min_Delay = ttc_subway_delay_data_2024$`Min Delay`,
Min_Gap = ttc_subway_delay_data_2024$`Min Gap`) #put dependent variables to a data frame called dependent_v_table
#independent variables: Date, Time, Day, Station, Line, Code, Bound, Vehicle
independent_v_table = data.frame(
Date = ttc_subway_delay_data_2024$Date,
Time = ttc_subway_delay_data_2024$Time,
Day = ttc_subway_delay_data_2024$Day,
Station = ttc_subway_delay_data_2024$Station,
Line = ttc_subway_delay_data_2024$Line,
Code = ttc_subway_delay_data_2024$Code,
Bound = ttc_subway_delay_data_2024$Bound,
Vehicle = ttc_subway_delay_data_2024$Vehicle) #put independent variables to a data frame called independent_v_table
dependent_n_independent = cbind(dependent_v_table,independent_v_table) #this code combine the dependent and independent datasets to the new dataset, dependent_n_independent using cbind()
#reshaping con't
#combining rows
union_delay = ttc_subway_delay_data_2024 %>% filter(str_detect(Station, "UNION")) #find out delays that have the string "UNION" in station variable
dupont_delay = ttc_subway_delay_data_2024 %>% filter(str_detect(Station, "DUPONT")) #find out delays that have the string "DUPONT" in station variable
union_and_dupont = rbind(union_delay,dupont_delay) #combine the two data sets
#Remove missing values in your dataset.
ttc_no_blank = ttc_subway_delay_data_2024 %>% drop_na() #remove all missing values in the data set
#Identify and remove duplicated data from your dataset.
unique(ttc_subway_delay_data_2024$Line) #find out unique line
## [1] "YU" "BD" "YUS"
## [4] "YU/BD" "SHP" NA
## [7] "BLOOR DANFORTH" "YU / BD" "YU/ BD"
## [10] "SRT" "YUS/BD" "SHEP"
## [13] "LINE 1" "TRACK LEVEL ACTIVITY" "YU & BD"
## [16] "109 RANEE" "ONGE-UNIVERSITY AND BL" "YU/BD/SHP"
## [19] "BD/ YUS" "BD/ YU" "BD/YU"
## [22] "BD / YU" "20 CLIFFSIDE"
is_duplicated = duplicated(ttc_subway_delay_data_2024) #find if the rows are duplicated in booleans
duplicates_only = ttc_subway_delay_data_2024[is_duplicated,] #put all the rows that are indicated as duplicated to the data set called duplicates_only
distinct_ttc = ttc_subway_delay_data_2024 %>% distinct() #extract the distinct entries from the raw data set
#Reorder multiple rows in descending order
ttc_subway_delay_data_2024 %>% arrange(desc(Station),desc(Day), desc(Code)) #arrange in descending order according to station, day and then code
## # A tibble: 26,467 × 10
## Date Time Day Station Code `Min Delay` `Min Gap` Bound
## <dttm> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 2024-05-28 00:00:00 23:00 Tuesday YU-ST … MUO 0 0 <NA>
## 2 2024-04-30 00:00:00 23:00 Tuesday YU- YO… MUO 0 0 <NA>
## 3 2024-04-30 00:00:00 23:00 Tuesday YU- YO… MUO 0 0 <NA>
## 4 2024-11-10 00:00:00 23:33 Sunday YU & B… MUWEA 0 0 <NA>
## 5 2024-08-17 00:00:00 01:47 Saturday YOUNGE… MUO 0 0 <NA>
## 6 2024-03-17 00:00:00 02:07 Sunday YOUNG … MUO 0 0 <NA>
## 7 2024-05-22 00:00:00 16:30 Wednesday YORKDA… TUS 7 12 N
## 8 2024-04-24 00:00:00 13:24 Wednesday YORKDA… TUCC 6 10 S
## 9 2024-06-05 00:00:00 17:53 Wednesday YORKDA… SUUT 17 20 N
## 10 2024-05-15 00:00:00 08:29 Wednesday YORKDA… SUO 3 6 S
## # ℹ 26,457 more rows
## # ℹ 2 more variables: Line <chr>, Vehicle <chr>
#Rename some of the column names in your dataset.
names(ttc_subway_delay_data_2024)[5] = "Delay Reason Code" #rename Code to Delay Reason Code
names(ttc_subway_delay_data_2024)[10] = "Vehicle_number" #rename column Vehicle to Vehicle_number
#Add new variables in your data frame by using a mathematical function (for e.g. – multiply an existing column by 2 and add it as a new variable to your data frame)
ttc_subway_delay_data_2024$delay_plus_gap = ttc_subway_delay_data_2024$`Min Delay` + ttc_subway_delay_data_2024$`Min Gap` #make an addition to the delay and gap columns and put it as delay_plus_gap
ttc_subway_delay_data_2024$delay_multiply_gap = ttc_subway_delay_data_2024$`Min Delay`*ttc_subway_delay_data_2024$`Min Gap` #multiply min delay and min gap and create a new column to the data frame
#Create a training set using a random number generator engine.
set.seed(333) #make it reproducible
training_ttc = ttc_subway_delay_data_2024 %>% sample_n(100, replace = FALSE) #produce training set with 100 data
#Print the summary statistics of your dataset.
summary(ttc_subway_delay_data_2024)
## Date Time Day
## Min. :2024-01-01 00:00:00 Length:26467 Length:26467
## 1st Qu.:2024-04-01 00:00:00 Class :character Class :character
## Median :2024-07-02 00:00:00 Mode :character Mode :character
## Mean :2024-07-01 21:44:28
## 3rd Qu.:2024-10-03 00:00:00
## Max. :2024-12-31 00:00:00
## Station Delay Reason Code Min Delay Min Gap
## Length:26467 Length:26467 Min. : 0.000 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.000 1st Qu.: 0.000
## Mode :character Mode :character Median : 0.000 Median : 0.000
## Mean : 2.806 Mean : 4.111
## 3rd Qu.: 4.000 3rd Qu.: 8.000
## Max. :716.000 Max. :719.000
## Bound Line Vehicle_number delay_plus_gap
## Length:26467 Length:26467 Length:26467 Min. : 0.000
## Class :character Class :character Class :character 1st Qu.: 0.000
## Mode :character Mode :character Mode :character Median : 0.000
## Mean : 6.917
## 3rd Qu.: 12.000
## Max. :1435.000
## delay_multiply_gap
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 0.0
## Mean : 116.1
## 3rd Qu.: 28.0
## Max. :514804.0
#Use any of the numerical variables from the dataset and perform the following statistical functions.
#mean
mean(ttc_subway_delay_data_2024$`Min Gap`) #find the mean for the column Min Gap
## [1] 4.110628
#median
median(ttc_subway_delay_data_2024$`Min Delay`) #find the median for the column Min Delay
## [1] 0
#mode
freq_table = table(ttc_subway_delay_data_2024$`Min Delay`) #create a frequency table on Min Delay
max_freq = max(freq_table) #find the max frequency from the frequency table
modes <- names(freq_table[freq_table == max_freq]) #get the value with the greatest frequency
modes #display mode
## [1] "0"
#range
range(ttc_subway_delay_data_2024$`Min Delay`) #find the range for Min Delay
## [1] 0 716
#Plot a scatter plot for any 2 variables in your dataset.
ggplot(ttc_subway_delay_data_2024, aes(x=`Min Delay`, y=`Min Gap`))+geom_point()

#Plot a bar plot for any 2 variables in your dataset
desired_day_order = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday") #arrange day in a logical order
day_manual_order <- ttc_subway_delay_data_2024 %>% mutate(Day = factor(Day, levels = desired_day_order)) #create new dataset with ordered day
ggplot(day_manual_order, aes(x=Day, fill = Bound, colour = Bound))+geom_bar(position="dodge") #create bar plot on the count of bound and day

#Find the correlation between any 2 variables by applying least square linear regression model.
ttc_corr = cor(ttc_subway_delay_data_2024$`Min Delay`,ttc_subway_delay_data_2024$`Min Gap`,method = "pearson") #find correlation between min delay and min gap with least square linear regression model