#read csv file
df <- read.csv("c2k_data_comma.csv")
head(df)
dim(df)
## [1] 3943 98
tail(df)
summary(df)
## nr i1_legid i1_rcs_p i1_rcs_e
## Min. : 0.0 Min. : 1 Min. : 5.0 Min. : 1
## 1st Qu.: 986.2 1st Qu.: 3591 1st Qu.: 377.8 1st Qu.: 113
## Median :1971.5 Median : 7430 Median : 1085.0 Median : 340
## Mean :1971.3 Mean : 7304 Mean : 2203.5 Mean : 1653
## 3rd Qu.:2956.8 3rd Qu.:10922 3rd Qu.: 1946.5 3rd Qu.: 1375
## Max. :3942.0 Max. :14661 Max. :47190.0 Max. :46357
## NA's :1 NA's :1 NA's :1 NA's :1
## i1_dep_1_p i1_dep_1_e i1_dep_1_place i1_rcf_1_p
## Min. : 75.0 Min. : 4.0 Min. :101.0 Min. : 50
## 1st Qu.: 120.0 1st Qu.: 263.0 1st Qu.:308.0 1st Qu.: 455
## Median : 180.0 Median : 516.0 Median :485.0 Median : 820
## Mean : 205.9 Mean : 711.4 Mean :503.7 Mean : 796
## 3rd Qu.: 240.0 3rd Qu.: 949.8 3rd Qu.:700.0 3rd Qu.:1020
## Max. :2876.0 Max. :9513.0 Max. :815.0 Max. :5001
## NA's :1 NA's :1 NA's :1 NA's :1
## i1_rcf_1_e i1_rcf_1_place i1_dep_2_p i1_dep_2_e
## Min. : 13.0 Min. :101.0 Length:3943 Length:3943
## 1st Qu.: 274.0 1st Qu.:281.0 Class :character Class :character
## Median : 657.5 Median :485.0 Mode :character Mode :character
## Mean : 666.4 Mean :466.7
## 3rd Qu.: 883.0 3rd Qu.:700.0
## Max. :38116.0 Max. :815.0
## NA's :1 NA's :1
## i1_dep_2_place i1_rcf_2_p i1_rcf_2_e i1_rcf_2_place
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i1_dep_3_p i1_dep_3_e i1_dep_3_place i1_rcf_3_p
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i1_rcf_3_e i1_rcf_3_place i1_dlv_p i1_dlv_e
## Length:3943 Length:3943 Min. : 180 Min. : 1
## Class :character Class :character 1st Qu.:3420 1st Qu.: 212
## Mode :character Mode :character Median :3780 Median : 863
## Mean :3694 Mean : 2251
## 3rd Qu.:4230 3rd Qu.: 2176
## Max. :9915 Max. :90977
## NA's :1 NA's :1
## i1_hops i2_legid i2_rcs_p i2_rcs_e
## Min. :1.000 Length:3943 Length:3943 Length:3943
## 1st Qu.:1.000 Class :character Class :character Class :character
## Median :1.000 Mode :character Mode :character Mode :character
## Mean :1.309
## 3rd Qu.:2.000
## Max. :3.000
## NA's :1
## i2_dep_1_p i2_dep_1_e i2_dep_1_place i2_rcf_1_p
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i2_rcf_1_e i2_rcf_1_place i2_dep_2_p i2_dep_2_e
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i2_dep_2_place i2_rcf_2_p i2_rcf_2_e i2_rcf_2_place
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i2_dep_3_p i2_dep_3_e i2_dep_3_place i2_rcf_3_p
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i2_rcf_3_e i2_rcf_3_place i2_dlv_p i2_dlv_e
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i2_hops i3_legid i3_rcs_p i3_rcs_e
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i3_dep_1_p i3_dep_1_e i3_dep_1_place i3_rcf_1_p
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i3_rcf_1_e i3_rcf_1_place i3_dep_2_p i3_dep_2_e
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i3_dep_2_place i3_rcf_2_p i3_rcf_2_e i3_rcf_2_place
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i3_dep_3_p i3_dep_3_e i3_dep_3_place i3_rcf_3_p
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i3_rcf_3_e i3_rcf_3_place i3_dlv_p i3_dlv_e
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## i3_hops o_legid o_rcs_p o_rcs_e
## Length:3943 Min. : 20 Min. : 5 Min. : 1
## Class :character 1st Qu.: 3674 1st Qu.: 746 1st Qu.: 241
## Mode :character Median : 7201 Median : 1392 Median : 717
## Mean : 7325 Mean : 2766 Mean : 2140
## 3rd Qu.:11097 3rd Qu.: 2717 3rd Qu.: 1840
## Max. :14660 Max. :49136 Max. :49015
## NA's :1 NA's :1 NA's :1
## o_dep_1_p o_dep_1_e o_dep_1_place o_rcf_1_p
## Min. : 90.0 Min. : 18.0 Min. :101.0 Min. : 45.0
## 1st Qu.: 90.0 1st Qu.: 303.0 1st Qu.:341.0 1st Qu.: 630.0
## Median : 180.0 Median : 600.0 Median :485.0 Median : 840.0
## Mean : 207.1 Mean : 803.9 Mean :479.7 Mean : 855.7
## 3rd Qu.: 240.0 3rd Qu.: 1029.5 3rd Qu.:671.0 3rd Qu.:1050.0
## Max. :1620.0 Max. :12256.0 Max. :815.0 Max. :4986.0
## NA's :1 NA's :1 NA's :1 NA's :1
## o_rcf_1_e o_rcf_1_place o_dep_2_p o_dep_2_e
## Min. : 1.0 Min. :100.0 Length:3943 Length:3943
## 1st Qu.: 385.0 1st Qu.:206.0 Class :character Class :character
## Median : 658.5 Median :476.0 Mode :character Mode :character
## Mean : 690.3 Mean :461.3
## 3rd Qu.: 920.0 3rd Qu.:700.0
## Max. :5520.0 Max. :815.0
## NA's :1 NA's :1
## o_dep_2_place o_rcf_2_p o_rcf_2_e o_rcf_2_place
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## o_dep_3_p o_dep_3_e o_dep_3_place o_rcf_3_p
## Length:3943 Length:3943 Length:3943 Length:3943
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## o_rcf_3_e o_rcf_3_place o_dlv_p o_dlv_e
## Length:3943 Length:3943 Min. : 240 Min. : 1
## Class :character Class :character 1st Qu.: 2880 1st Qu.: 400
## Mode :character Mode :character Median : 3780 Median : 1202
## Mean : 3573 Mean : 3699
## 3rd Qu.: 4080 3rd Qu.: 3174
## Max. :11520 Max. :560130
## NA's :1 NA's :1
## o_hops legs
## Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :2.000
## Mean :1.475 Mean :2.012
## 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :3.000 Max. :3.000
## NA's :1 NA's :1
• RCS: Check in freight at departure airline. Shipment is checked in and a receipt is produced at departure airport. • DEP: Confirm goods on board. Aircraft has departed with shipment on board. • RCF: Accept freight at arrival airline. Shipment is checked in according to the documents and stored at arrival warehouse. • DLV: Deliver freight. Receipt of shipment was signed at destination airport.
Null Data from the dataset
colSums(is.na(df))
## nr i1_legid i1_rcs_p i1_rcs_e i1_dep_1_p
## 1 1 1 1 1
## i1_dep_1_e i1_dep_1_place i1_rcf_1_p i1_rcf_1_e i1_rcf_1_place
## 1 1 1 1 1
## i1_dep_2_p i1_dep_2_e i1_dep_2_place i1_rcf_2_p i1_rcf_2_e
## 0 0 0 0 0
## i1_rcf_2_place i1_dep_3_p i1_dep_3_e i1_dep_3_place i1_rcf_3_p
## 0 0 0 0 0
## i1_rcf_3_e i1_rcf_3_place i1_dlv_p i1_dlv_e i1_hops
## 0 0 1 1 1
## i2_legid i2_rcs_p i2_rcs_e i2_dep_1_p i2_dep_1_e
## 0 0 0 0 0
## i2_dep_1_place i2_rcf_1_p i2_rcf_1_e i2_rcf_1_place i2_dep_2_p
## 0 0 0 0 0
## i2_dep_2_e i2_dep_2_place i2_rcf_2_p i2_rcf_2_e i2_rcf_2_place
## 0 0 0 0 0
## i2_dep_3_p i2_dep_3_e i2_dep_3_place i2_rcf_3_p i2_rcf_3_e
## 0 0 0 0 0
## i2_rcf_3_place i2_dlv_p i2_dlv_e i2_hops i3_legid
## 0 0 0 0 0
## i3_rcs_p i3_rcs_e i3_dep_1_p i3_dep_1_e i3_dep_1_place
## 0 0 0 0 0
## i3_rcf_1_p i3_rcf_1_e i3_rcf_1_place i3_dep_2_p i3_dep_2_e
## 0 0 0 0 0
## i3_dep_2_place i3_rcf_2_p i3_rcf_2_e i3_rcf_2_place i3_dep_3_p
## 0 0 0 0 0
## i3_dep_3_e i3_dep_3_place i3_rcf_3_p i3_rcf_3_e i3_rcf_3_place
## 0 0 0 0 0
## i3_dlv_p i3_dlv_e i3_hops o_legid o_rcs_p
## 0 0 0 1 1
## o_rcs_e o_dep_1_p o_dep_1_e o_dep_1_place o_rcf_1_p
## 1 1 1 1 1
## o_rcf_1_e o_rcf_1_place o_dep_2_p o_dep_2_e o_dep_2_place
## 1 1 0 0 0
## o_rcf_2_p o_rcf_2_e o_rcf_2_place o_dep_3_p o_dep_3_e
## 0 0 0 0 0
## o_dep_3_place o_rcf_3_p o_rcf_3_e o_rcf_3_place o_dlv_p
## 0 0 0 0 1
## o_dlv_e o_hops legs
## 1 1 1
# Identify rows with missing values
missing_rows <- complete.cases(df)
# Create a new data set that excludes the rows with missing values #row 30-33 missing value
df <- df[missing_rows,]
head(df)
df_num <- df[, sapply(df, is.numeric)] / 2
# view new data frame with numeric columns divided by scalar value
df_num;dim(df_num)
## [1] 3942 26
# drop column with unique id using subset()
df_new <- subset(df_num, select = -c(nr,i1_legid,i1_dep_1_place,i1_rcf_1_place,o_legid,o_dep_1_place,o_rcf_1_place))
dim(df_new) #remove this columns form the df_num
## [1] 3942 19
df_new_norm <- scale(df_new)
head(df_new_norm)
## i1_rcs_p i1_rcs_e i1_dep_1_p i1_dep_1_e i1_rcf_1_p i1_rcf_1_e
## 1 -0.4280143 -0.31518862 0.0292841 -0.72490767 0.3159088 0.09113646
## 2 -0.2902919 -0.23481716 -0.8261285 -0.60516278 2.5886796 0.97971022
## 3 0.4647260 0.54144812 -0.8261285 -0.62998794 0.2477257 -0.15619850
## 4 -0.3084413 -0.32594873 0.2431373 0.09578287 -1.0363898 -0.11693899
## 5 -0.1295089 -0.03696284 -0.3984222 -0.68693978 -0.4795610 -0.07113621
## 6 -0.4318577 -0.34153991 0.2431373 -0.66065432 -1.3204862 -0.58020130
## i1_dlv_p i1_dlv_e i1_hops o_rcs_p o_rcs_e o_dep_1_p
## 1 -1.67688092 -0.1274864 -0.6510436 1.47547899 1.1476822 0.2085311
## 2 0.05037233 0.6336340 -0.6510436 0.05445461 0.1854621 0.5891651
## 3 0.05037233 -0.3455632 -0.6510436 -0.27496561 -0.3173577 0.2085311
## 4 -1.97356898 -0.3072476 1.4560285 0.24690863 0.2193505 -0.7430537
## 5 0.31474783 -0.1783352 1.4560285 -0.53920313 -0.4154224 -0.1721028
## 6 0.70249856 -0.3824465 -0.6510436 -0.54250352 -0.4471929 0.2085311
## o_dep_1_e o_rcf_1_p o_rcf_1_e o_dlv_p o_dlv_e o_hops
## 1 2.52099640 1.07534234 1.28583845 -1.6103113 -0.22635318 -0.9261854
## 2 -0.63346378 -0.04688199 -0.36773199 0.1710832 -0.22559048 -0.9261854
## 3 0.20193610 0.78061676 1.31940212 -1.7429070 -0.15105382 -0.9261854
## 4 0.09674666 -0.09222439 0.59442672 0.1191979 -0.24542070 -0.9261854
## 5 -0.82304940 -1.41848951 -1.17102264 0.3267390 0.07616165 1.0251897
## 6 -0.60655485 0.23650799 -0.01866976 0.2229685 -0.22406508 -0.9261854
## legs
## 1 -0.01475652
## 2 -0.01475652
## 3 -1.22663550
## 4 -1.22663550
## 5 -1.22663550
## 6 -0.01475652
cor_matrix <- cor(df_new_norm)
#cor_matrix
library(corrplot)
## corrplot 0.92 loaded
corrplot(cor_matrix); #cor plot before calculating disruption between planned duration and effective duration
###########performing PCA and summarize the results
data.pca <- princomp(cor_matrix)
summary(data.pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 0.5155318 0.4759836 0.4031663 0.34157424 0.31579682
## Proportion of Variance 0.2101835 0.1791726 0.1285453 0.09226946 0.07886844
## Cumulative Proportion 0.2101835 0.3893561 0.5179014 0.61017087 0.68903931
## Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
## Standard deviation 0.25008974 0.23647051 0.22837391 0.21805728 0.20656234
## Proportion of Variance 0.04946289 0.04422234 0.04124589 0.03760356 0.03374349
## Cumulative Proportion 0.73850221 0.78272455 0.82397044 0.86157400 0.89531749
## Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
## Standard deviation 0.19969527 0.17276575 0.16535925 0.13628943 0.11490754
## Proportion of Variance 0.03153721 0.02360495 0.02162443 0.01468967 0.01044203
## Cumulative Proportion 0.92685470 0.95045965 0.97208409 0.98677376 0.99721579
## Comp.16 Comp.17 Comp.18 Comp.19
## Standard deviation 0.059063450 5.649199e-03 4.255576e-04 0
## Proportion of Variance 0.002758833 2.523838e-05 1.432202e-07 0
## Cumulative Proportion 0.999974618 9.999999e-01 1.000000e+00 1
library(corrplot)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
data.pca$loadings###provides the eigenvalues and vectors
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## i1_rcs_p 0.512 0.155 0.266 0.116 0.269 0.123
## i1_rcs_e 0.497 0.144 0.272 0.137 0.301 0.131
## i1_dep_1_p -0.325 -0.331 0.484 0.203 0.183
## i1_dep_1_e 0.196 0.111 -0.176 -0.163 0.202 0.460
## i1_rcf_1_p 0.288 0.233 -0.258 -0.327 -0.187 -0.191 -0.144
## i1_rcf_1_e 0.192 0.166 -0.273 -0.355 -0.223 -0.257 -0.166 -0.252
## i1_dlv_p -0.137 -0.256 0.274 0.321 -0.336 0.229
## i1_dlv_e -0.108 -0.102 0.151 -0.422 -0.271 -0.150 0.331
## i1_hops -0.160 -0.125 0.439 -0.323 -0.141 -0.317
## o_rcs_p -0.353 0.343 0.358 -0.269 0.178
## o_rcs_e -0.338 0.319 0.379 -0.291 0.195
## o_dep_1_p -0.435 0.313 0.308 -0.139 -0.172
## o_dep_1_e -0.147 0.214 -0.152 0.253 -0.200 0.321
## o_rcf_1_p -0.116 0.416 -0.117 0.448 -0.115 -0.153 0.123 -0.116
## o_rcf_1_e -0.144 0.375 -0.193 0.437 -0.219 0.126 -0.133
## o_dlv_p -0.283 0.328 -0.365 0.307
## o_dlv_e -0.125 -0.103 0.365 0.252 -0.524 -0.379
## o_hops -0.278 0.175 -0.330 0.331 0.160 -0.180
## legs 0.381 -0.319 0.689 -0.292
## Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17
## i1_rcs_p 0.152
## i1_rcs_e 0.110
## i1_dep_1_p -0.455 0.206 0.293 -0.228 0.287
## i1_dep_1_e -0.412 -0.403 0.393 0.153 -0.100 0.304
## i1_rcf_1_p 0.112 -0.316 -0.642 0.228
## i1_rcf_1_e 0.246 0.451 0.477 0.167
## i1_dlv_p -0.409 -0.164 0.365 -0.254 0.410
## i1_dlv_e -0.349 0.635 0.201
## i1_hops -0.250 0.581 -0.271 0.220
## o_rcs_p 0.153
## o_rcs_e -0.118
## o_dep_1_p -0.152 -0.523 -0.251 0.188 0.374
## o_dep_1_e 0.519 0.194 0.454 0.348 -0.111 0.147
## o_rcf_1_p -0.704 0.158
## o_rcf_1_e 0.699 0.113
## o_dlv_p 0.252 0.159 -0.109 -0.456 0.268 0.423
## o_dlv_e -0.479 0.183 0.174 0.103 0.213
## o_hops 0.122 0.556 -0.381 0.288 -0.112 0.169
## legs -0.115 0.286 0.127 0.249
## Comp.18 Comp.19
## i1_rcs_p 0.454 0.543
## i1_rcs_e -0.436 -0.545
## i1_dep_1_p
## i1_dep_1_e
## i1_rcf_1_p
## i1_rcf_1_e
## i1_dlv_p
## i1_dlv_e
## i1_hops
## o_rcs_p -0.549 0.441
## o_rcs_e 0.538 -0.443
## o_dep_1_p
## o_dep_1_e
## o_rcf_1_p
## o_rcf_1_e
## o_dlv_p
## o_dlv_e
## o_hops
## legs
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.053 0.053 0.053 0.053 0.053 0.053 0.053 0.053 0.053
## Cumulative Var 0.053 0.105 0.158 0.211 0.263 0.316 0.368 0.421 0.474
## Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.053 0.053 0.053 0.053 0.053 0.053 0.053 0.053
## Cumulative Var 0.526 0.579 0.632 0.684 0.737 0.789 0.842 0.895
## Comp.18 Comp.19
## SS loadings 1.000 1.000
## Proportion Var 0.053 0.053
## Cumulative Var 0.947 1.000
fviz_eig(data.pca, addlabels = TRUE)
fviz_pca_var(data.pca, col.var = "black")
fviz_cos2(data.pca, choice = "var", axes = 1:2)
Apply the Grubbs’ test: The Grubbs’ test is a statistical test that can be used to detect a single outlier in a univariate dataset. You can use the grubbs.test() function from the outliers package in R to perform this test. For example:
library(outliers)
grubbs.test(df$i1_rcs_p)
##
## Grubbs test for one outlier
##
## data: df$i1_rcs_p
## G = 9.60565, U = 0.97658, p-value < 2.2e-16
## alternative hypothesis: highest value 47190 is an outlier
Apply the Z-score test: The Z-score test is a statistical test that measures how many standard deviations an observation is from the mean of the dataset. We can use the scale() function in R to compute the Z-scores for each observation. For example
z_scores <- scale(df$i1_rcs_p)
head(z_scores)
## [,1]
## [1,] -0.4280143
## [2,] -0.2902919
## [3,] 0.4647260
## [4,] -0.3084413
## [5,] -0.1295089
## [6,] -0.4318577
We can then use a threshold value to identify outliers. A common threshold is 3, which identifies values that are more than 3 standard deviations away from the mean. For example:
outliers <- which(abs(z_scores) > 3)
outliers
## [1] 18 44 190 283 366 409 491 524 530 535 567 589 603 637 669
## [16] 697 708 738 809 811 813 881 904 914 941 1201 1212 1220 1314 1512
## [31] 1573 1604 1770 1807 1978 2071 2088 2090 2164 2192 2195 2208 2264 2273 2344
## [46] 2525 2527 2556 2586 2616 2638 2727 2790 2874 2989 3000 3024 3029 3062 3068
## [61] 3078 3090 3118 3132 3165 3209 3356 3372 3447 3742 3796 3801 3830
This will return the indices of the observations that are identified as outliers.
Apply the Tukey’s test: The Tukey’s test is a statistical test that identifies outliers in a dataset based on the interquartile range (IQR). We can use the boxplot.stats() function in R to compute the IQR and identify outliers. For example:
boxplot_stats <- boxplot.stats(df$i1_rcs_p)
boxplot_stats
## $stats
## [1] 5 377 1085 1947 4299
##
## $n
## [1] 3942
##
## $conf
## [1] 1045.491 1124.509
##
## $out
## [1] 4380 6145 27415 5442 5877 14854 22224 4845 6286 4909 5083 14220
## [13] 4669 7731 5833 5845 37991 14387 4334 12045 5064 4843 6053 7080
## [25] 20680 4386 10241 4983 8343 4763 4917 5432 8837 39492 4705 6426
## [37] 9107 35769 12192 4859 5473 7054 38054 5618 5478 10861 7129 4310
## [49] 6875 37750 38054 33285 27992 4557 4788 16375 38007 4810 8644 4540
## [61] 25592 4378 19656 38353 24424 16207 12490 4832 37069 7649 4740 4873
## [73] 37994 18950 38354 7646 14919 6691 5864 37993 6875 4508 20219 4590
## [85] 4499 38053 5763 42824 4659 13205 4807 6423 7391 5994 4527 7756
## [97] 9191 6188 5660 14854 5825 7488 8474 4379 4561 5892 6876 14715
## [109] 4499 22664 6165 5943 21505 34676 6050 11344 7441 8035 14665 5056
## [121] 8698 7360 14715 17574 13480 5563 7527 13644 4770 11994 16116 5688
## [133] 4620 5900 4351 5347 14854 13108 8661 10503 4515 5966 15405 13188
## [145] 9980 11732 6420 38054 11924 5008 5730 5755 5685 5888 4889 33151
## [157] 7640 5332 36761 6315 4562 11744 4935 16209 4651 5995 14854 7657
## [169] 5985 5787 4361 5823 27526 9076 8690 13087 4475 37993 10317 6477
## [181] 4797 8543 6831 16031 11564 4557 4886 5677 21865 4516 9145 13719
## [193] 4802 4864 8691 9880 5634 9576 4552 6231 7599 4406 30364 8413
## [205] 4791 25575 37506 6908 6622 5058 10431 4982 11694 7551 4879 16016
## [217] 7445 9687 31353 6959 4972 7367 7089 20143 37994 5971 21559 5937
## [229] 5924 10550 6265 38053 5285 21501 6153 12870 4539 7920 5599 4816
## [241] 5545 18945 4853 8424 9105 4402 6030 4523 6107 4368 4318 5252
## [253] 8033 14919 4772 5662 4676 4530 4831 6138 8192 8308 40198 47190
## [265] 11881 27570 11320 6013 4410 4704 38353 6624 5811 6766 5995 37411
## [277] 18566 9896 10117 4452 4590 5659 6175 6887 14639 17492 4724 8597
## [289] 6077 8350 17335 15866 9687 14854 8634 14854 6074 8513 37328 6739
## [301] 5718 4770 11708 4336 4650 4845 5502 11741 4887 4767 7591 37993
## [313] 33458 4379 4580 37993 39648 4311 6180 23598 37919 6747 32416 6696
## [325] 38053 11184 35342 37601 9056 11835 11490 39491 6945 4312 7302 6997
## [337] 4747 14854 23234 5648 4732 4575 10218 11579 12661 33087 6000 5527
## [349] 4350 26944 14715 4552 6662 11999 31617 4665 9438 4602 5850 4385
## [361] 13153 5830 4688 10980 5774 13479 17586 7165 5382 5042 12915 14855
## [373] 42990 37994 32519 10565 5537 4649 6847 5334 16183 7666 14919 5844
## [385] 4614 4772
ggplot2: ggplot2 is a powerful package for creating visualizations in R. We can use it to create various types of plots like scatterplots, boxplots, histograms, and more to explore the relationships and distributions of variables in the dataset. For example, We use ggplot2 to create a scatterplot of two variables with the following code:
library(ggplot2)
ggplot(df, aes(x = i1_rcs_p , y = i1_rcs_e), colour = 'red',
width = 0.4) +
geom_point()
dplyr and tidyr: dplyr and tidyr are packages for manipulating and cleaning data in R. We can use them to filter, group, summarize, and reshape the data to extract meaningful insights. For example, you could use dplyr and tidyr to calculate summary statistics like mean, median, and standard deviation for a specific variable with the following code:
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
df %>%
summarize(mean = mean(i1_rcs_p),
median = median(i1_rcs_p),
sd = sd(i1_rcs_p))
stats: stats is a built-in package in R that contains various statistical functions and tests. We can use it to perform hypothesis testing, calculate p-values, and more. For example, we could use stats to perform a t-test between two groups of data with the following code:
#t.test(i1_rcs_p ~ i1_rcs_e, data = df, paired = TRUE)
finding distubtion between plabbed duration and effective duration
df_new$i1_rcs_pe <- df$i1_rcs_p-df$i1_rcs_e
df_new$i1_dep_1_pe <- df$i1_dep_1_p-df$i1_dep_1_e
df_new$i1_rcf_1_pe <- df$i1_rcf_1_p-df$i1_rcf_1_e
df_new$i1_dlv_pe <- df$i1_dlv_p-df$i1_dlv_e
df_new$o_rcs_pe <- df$o_rcs_p-df$o_rcs_e
df_new$o_dep_1_pe <- df$o_dep_1_p-df$o_dep_1_e
df_new$o_rcf_1_pe <- df$o_rcf_1_p-df$o_rcf_1_e
df_new$o_dlv_pe <- df$o_dlv_p-df$o_dlv_e
dim(df_new);head(df_new)
## [1] 3942 27
Correlation between new disrupted variable
cor_matrix <- cor(df_new[20:27])
cor_matrix
## i1_rcs_pe i1_dep_1_pe i1_rcf_1_pe i1_dlv_pe o_rcs_pe
## i1_rcs_pe 1.000000000 -0.931206906 -0.0054907363 -0.019022511 0.014507883
## i1_dep_1_pe -0.931206906 1.000000000 -0.0308656464 0.027337464 -0.021876622
## i1_rcf_1_pe -0.005490736 -0.030865646 1.0000000000 0.001988295 -0.050860947
## i1_dlv_pe -0.019022511 0.027337464 0.0019882952 1.000000000 -0.008196962
## o_rcs_pe 0.014507883 -0.021876622 -0.0508609473 -0.008196962 1.000000000
## o_dep_1_pe -0.015593720 0.018061415 0.0530139448 0.015381367 -0.923032723
## o_rcf_1_pe 0.039677539 -0.047682815 0.0137866734 0.054912037 0.045102564
## o_dlv_pe -0.006770061 0.009598674 0.0001278906 0.005106971 0.007111367
## o_dep_1_pe o_rcf_1_pe o_dlv_pe
## i1_rcs_pe -0.015593720 0.03967754 -0.0067700605
## i1_dep_1_pe 0.018061415 -0.04768282 0.0095986744
## i1_rcf_1_pe 0.053013945 0.01378667 0.0001278906
## i1_dlv_pe 0.015381367 0.05491204 0.0051069706
## o_rcs_pe -0.923032723 0.04510256 0.0071113666
## o_dep_1_pe 1.000000000 -0.06365526 -0.0044848221
## o_rcf_1_pe -0.063655265 1.00000000 -0.0333829944
## o_dlv_pe -0.004484822 -0.03338299 1.0000000000
corrplot(cor_matrix, method=c("ellipse"),type = "lower", title = "Female's Correlated Data", addCoef.col = "black", cex.main=0.5)
eig <- eigen(cor_matrix)
eig
## eigen() decomposition
## $values
## [1] 1.97443005 1.89722087 1.06218728 1.00376446 0.99411374 0.92356317 0.07740876
## [8] 0.06731167
##
## $vectors
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.501815133 0.49483081 0.02764096 0.021447940 -0.041255808 0.03844461
## [2,] -0.504179085 -0.49358737 -0.01925220 -0.017776094 0.000863130 -0.03564541
## [3,] -0.037868933 0.07195946 -0.19886890 0.086091376 0.955973310 0.17823000
## [4,] -0.030415887 -0.01297102 -0.61516301 0.467408380 -0.276478249 0.57051315
## [5,] 0.490724834 -0.50245227 0.02470521 0.010121058 0.050640511 0.06558617
## [6,] -0.491121724 0.50332581 -0.01570473 -0.003076132 -0.050302218 -0.04561641
## [7,] 0.097861507 -0.01197810 -0.69183318 -0.034141179 -0.003030366 -0.71429912
## [8,] -0.006128237 -0.01513055 0.31843496 0.878671815 0.053554302 -0.35127567
## [,7] [,8]
## [1,] 0.1834615202 -0.682101050
## [2,] 0.1819033307 -0.683467745
## [3,] 0.0090788171 -0.026341764
## [4,] 0.0044731153 0.008062990
## [5,] -0.6818744614 -0.184930477
## [6,] -0.6841308957 -0.180444534
## [7,] -0.0127258070 -0.009301505
## [8,] 0.0008975133 0.002251604
####################################### PCA
#perfoming PCA for both data
df_pca <- princomp(cor_matrix)
df_pca
## Call:
## princomp(x = cor_matrix)
##
## Standard deviations:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## 0.69805064 0.67072851 0.36871906 0.35211344 0.32748945 0.25757418 0.02653907
## Comp.8
## 0.00000000
##
## 8 variables and 8 observations.
summary(df_pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 0.6980506 0.6707285 0.36871906 0.35211344 0.3274894
## Proportion of Variance 0.3553152 0.3280450 0.09913593 0.09040764 0.0782050
## Cumulative Proportion 0.3553152 0.6833602 0.78249615 0.87290379 0.9511088
## Comp.6 Comp.7 Comp.8
## Standard deviation 0.25757418 0.0265390716 0
## Proportion of Variance 0.04837763 0.0005135839 0
## Cumulative Proportion 0.99948642 1.0000000000 1
find the negative values in i1_rcs_pe
neg_i1_rcs_pe <- df_new$i1_rcs_pe[df_new$i1_rcs_pe<0]
# print the negative values
if (length(neg_i1_rcs_pe) > 0) {
cat("Negative values found:", neg_i1_rcs_pe, "\n")
} else {
cat("No negative values found\n")
}
## Negative values found: -19 -13 -488 -198 -24 -151 -87 -639 -60 -22 -153 -12 -108 -2 -212 -44 -100 -20 -99 -68 -36 -10 -600 -1 -7 -82 -24 -4 -27 -100 -20 -12 -90 -99 -1 -59 -36 -94 -18 -90 -20 -21 -28 -90 -29 -41 -32 -6 -1 -16 -3 -34 -47 -780 -10 -56 -64 -63 -11 -5 -32 -18 -22 -199 -54 -7 -4 -11 -80 -142 -19 -47 -126 -4 -82 -305 -40 -688 -46 -204 -77 -11 -38 -15 -8 -2 -41 -29 -74 -1200 -76 -13 -10 -28 -30 -109 -1 -65 -75 -7 -16 -8 -55 -19 -25 -23 -3 -2 -33 -11 -219 -419 -6 -4 -63 -15 -10 -41 -137 -8 -17 -20 -197 -12 -15 -10 -300 -70 -90 -50 -65 -656 -38 -8 -10 -22 -26 -85 -66 -2 -45 -65 -15 -15 -209 -424 -94 -93 -1 -230 -9 -10 -17 -4 -68 -42 -22 -6 -40 -41 -1 -25 -9 -54 -47 -9 -420 -316 -68 -53 -60 -43 -109 -227 -320 -179 -18 -14 -76 -26 -872 -60 -75 -25 -20 -4 -611 -18 -3 -47 -42 -6 -47 -128 -82 -17 -6 -11 -305 -90 -13 -5 -5 -30 -37 -41 -2 -91 -298 -45 -6 -15 -28 -11 -48 -1 -5 -67 -32 -51 -105 -402 -70 -889 -5 -19 -1 -1 -8 -25 -107 -620 -39 -56 -8 -16 -366 -6
print("total negative value:")
## [1] "total negative value:"
print(length(neg_i1_rcs_pe))
## [1] 238
library(nFactors)
## Warning: package 'nFactors' was built under R version 4.2.3
## Loading required package: lattice
##
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
##
## parallel
################loading/presure to select component/variation of the component
load<-df_pca$loadings
str(load) #load er structure ta ki
## 'loadings' num [1:8, 1:8] 0.5013 -0.5038 -0.0394 -0.0321 0.4911 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:8] "i1_rcs_pe" "i1_dep_1_pe" "i1_rcf_1_pe" "i1_dlv_pe" ...
## ..$ : chr [1:8] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
load[,1]
## i1_rcs_pe i1_dep_1_pe i1_rcf_1_pe i1_dlv_pe o_rcs_pe o_dep_1_pe
## 0.501342342 -0.503804494 -0.039410827 -0.032081104 0.491108387 -0.491661356
## o_rcf_1_pe o_dlv_pe
## 0.096328808 -0.007543362
sum(load[,1]^2) ##ss loading for component 1
## [1] 1
###########screeplot
fviz_eig(df_pca)
##########dim = 2<5
fviz_pca_var(df_pca, col.var = "black")
fviz_cos2(df_pca,choice ="var",axes=1:2)