R Notebook

#read csv file

# Load the comma-separated data file into a data frame.
# NOTE(review): the summary printed further down reports many columns as
# character rather than numeric -- confirm how missing values are encoded
# in the file before relying on those columns.
df <- read.csv(file = "c2k_data_comma.csv")
# Preview the first six rows.
head(df, n = 6L)

# Number of rows and columns.
dim(df)

## [1] 3943   98

# Preview the last six rows.
tail(df, n = 6L)

# Per-column summary: quantiles for numeric columns, length/class/mode for
# character columns.
summary(object = df)

##        nr            i1_legid        i1_rcs_p          i1_rcs_e    
##  Min.   :   0.0   Min.   :    1   Min.   :    5.0   Min.   :    1  
##  1st Qu.: 986.2   1st Qu.: 3591   1st Qu.:  377.8   1st Qu.:  113  
##  Median :1971.5   Median : 7430   Median : 1085.0   Median :  340  
##  Mean   :1971.3   Mean   : 7304   Mean   : 2203.5   Mean   : 1653  
##  3rd Qu.:2956.8   3rd Qu.:10922   3rd Qu.: 1946.5   3rd Qu.: 1375  
##  Max.   :3942.0   Max.   :14661   Max.   :47190.0   Max.   :46357  
##  NA's   :1        NA's   :1       NA's   :1         NA's   :1      
##    i1_dep_1_p       i1_dep_1_e     i1_dep_1_place    i1_rcf_1_p  
##  Min.   :  75.0   Min.   :   4.0   Min.   :101.0   Min.   :  50  
##  1st Qu.: 120.0   1st Qu.: 263.0   1st Qu.:308.0   1st Qu.: 455  
##  Median : 180.0   Median : 516.0   Median :485.0   Median : 820  
##  Mean   : 205.9   Mean   : 711.4   Mean   :503.7   Mean   : 796  
##  3rd Qu.: 240.0   3rd Qu.: 949.8   3rd Qu.:700.0   3rd Qu.:1020  
##  Max.   :2876.0   Max.   :9513.0   Max.   :815.0   Max.   :5001  
##  NA's   :1        NA's   :1        NA's   :1       NA's   :1     
##    i1_rcf_1_e      i1_rcf_1_place   i1_dep_2_p         i1_dep_2_e       
##  Min.   :   13.0   Min.   :101.0   Length:3943        Length:3943       
##  1st Qu.:  274.0   1st Qu.:281.0   Class :character   Class :character  
##  Median :  657.5   Median :485.0   Mode  :character   Mode  :character  
##  Mean   :  666.4   Mean   :466.7                                        
##  3rd Qu.:  883.0   3rd Qu.:700.0                                        
##  Max.   :38116.0   Max.   :815.0                                        
##  NA's   :1         NA's   :1                                            
##  i1_dep_2_place      i1_rcf_2_p         i1_rcf_2_e        i1_rcf_2_place    
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   i1_dep_3_p         i1_dep_3_e        i1_dep_3_place      i1_rcf_3_p       
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   i1_rcf_3_e        i1_rcf_3_place        i1_dlv_p       i1_dlv_e    
##  Length:3943        Length:3943        Min.   : 180   Min.   :    1  
##  Class :character   Class :character   1st Qu.:3420   1st Qu.:  212  
##  Mode  :character   Mode  :character   Median :3780   Median :  863  
##                                        Mean   :3694   Mean   : 2251  
##                                        3rd Qu.:4230   3rd Qu.: 2176  
##                                        Max.   :9915   Max.   :90977  
##                                        NA's   :1      NA's   :1      
##     i1_hops        i2_legid           i2_rcs_p           i2_rcs_e        
##  Min.   :1.000   Length:3943        Length:3943        Length:3943       
##  1st Qu.:1.000   Class :character   Class :character   Class :character  
##  Median :1.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1.309                                                           
##  3rd Qu.:2.000                                                           
##  Max.   :3.000                                                           
##  NA's   :1                                                               
##   i2_dep_1_p         i2_dep_1_e        i2_dep_1_place      i2_rcf_1_p       
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   i2_rcf_1_e        i2_rcf_1_place      i2_dep_2_p         i2_dep_2_e       
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  i2_dep_2_place      i2_rcf_2_p         i2_rcf_2_e        i2_rcf_2_place    
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   i2_dep_3_p         i2_dep_3_e        i2_dep_3_place      i2_rcf_3_p       
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   i2_rcf_3_e        i2_rcf_3_place       i2_dlv_p           i2_dlv_e        
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    i2_hops            i3_legid           i3_rcs_p           i3_rcs_e        
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   i3_dep_1_p         i3_dep_1_e        i3_dep_1_place      i3_rcf_1_p       
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   i3_rcf_1_e        i3_rcf_1_place      i3_dep_2_p         i3_dep_2_e       
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  i3_dep_2_place      i3_rcf_2_p         i3_rcf_2_e        i3_rcf_2_place    
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   i3_dep_3_p         i3_dep_3_e        i3_dep_3_place      i3_rcf_3_p       
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   i3_rcf_3_e        i3_rcf_3_place       i3_dlv_p           i3_dlv_e        
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    i3_hops             o_legid         o_rcs_p         o_rcs_e     
##  Length:3943        Min.   :   20   Min.   :    5   Min.   :    1  
##  Class :character   1st Qu.: 3674   1st Qu.:  746   1st Qu.:  241  
##  Mode  :character   Median : 7201   Median : 1392   Median :  717  
##                     Mean   : 7325   Mean   : 2766   Mean   : 2140  
##                     3rd Qu.:11097   3rd Qu.: 2717   3rd Qu.: 1840  
##                     Max.   :14660   Max.   :49136   Max.   :49015  
##                     NA's   :1       NA's   :1       NA's   :1      
##    o_dep_1_p        o_dep_1_e       o_dep_1_place     o_rcf_1_p     
##  Min.   :  90.0   Min.   :   18.0   Min.   :101.0   Min.   :  45.0  
##  1st Qu.:  90.0   1st Qu.:  303.0   1st Qu.:341.0   1st Qu.: 630.0  
##  Median : 180.0   Median :  600.0   Median :485.0   Median : 840.0  
##  Mean   : 207.1   Mean   :  803.9   Mean   :479.7   Mean   : 855.7  
##  3rd Qu.: 240.0   3rd Qu.: 1029.5   3rd Qu.:671.0   3rd Qu.:1050.0  
##  Max.   :1620.0   Max.   :12256.0   Max.   :815.0   Max.   :4986.0  
##  NA's   :1        NA's   :1         NA's   :1       NA's   :1       
##    o_rcf_1_e      o_rcf_1_place    o_dep_2_p          o_dep_2_e        
##  Min.   :   1.0   Min.   :100.0   Length:3943        Length:3943       
##  1st Qu.: 385.0   1st Qu.:206.0   Class :character   Class :character  
##  Median : 658.5   Median :476.0   Mode  :character   Mode  :character  
##  Mean   : 690.3   Mean   :461.3                                        
##  3rd Qu.: 920.0   3rd Qu.:700.0                                        
##  Max.   :5520.0   Max.   :815.0                                        
##  NA's   :1        NA's   :1                                            
##  o_dep_2_place       o_rcf_2_p          o_rcf_2_e         o_rcf_2_place     
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   o_dep_3_p          o_dep_3_e         o_dep_3_place       o_rcf_3_p        
##  Length:3943        Length:3943        Length:3943        Length:3943       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   o_rcf_3_e         o_rcf_3_place         o_dlv_p         o_dlv_e      
##  Length:3943        Length:3943        Min.   :  240   Min.   :     1  
##  Class :character   Class :character   1st Qu.: 2880   1st Qu.:   400  
##  Mode  :character   Mode  :character   Median : 3780   Median :  1202  
##                                        Mean   : 3573   Mean   :  3699  
##                                        3rd Qu.: 4080   3rd Qu.:  3174  
##                                        Max.   :11520   Max.   :560130  
##                                        NA's   :1       NA's   :1       
##      o_hops           legs      
##  Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000  
##  Median :1.000   Median :2.000  
##  Mean   :1.475   Mean   :2.012  
##  3rd Qu.:2.000   3rd Qu.:3.000  
##  Max.   :3.000   Max.   :3.000  
##  NA's   :1       NA's   :1

• RCS: Check in freight at departure airline. Shipment is checked in and a receipt is produced at departure airport. • DEP: Confirm goods on board. Aircraft has departed with shipment on board. • RCF: Accept freight at arrival airline. Shipment is checked in according to the documents and stored at arrival warehouse. • DLV: Deliver freight. Receipt of shipment was signed at destination airport.

Missing (NA) values per column in the dataset

# Count the NA entries in every column; the result is a numeric vector named
# by column.
vapply(df, function(column) sum(is.na(column)), numeric(1))

##             nr       i1_legid       i1_rcs_p       i1_rcs_e     i1_dep_1_p 
##              1              1              1              1              1 
##     i1_dep_1_e i1_dep_1_place     i1_rcf_1_p     i1_rcf_1_e i1_rcf_1_place 
##              1              1              1              1              1 
##     i1_dep_2_p     i1_dep_2_e i1_dep_2_place     i1_rcf_2_p     i1_rcf_2_e 
##              0              0              0              0              0 
## i1_rcf_2_place     i1_dep_3_p     i1_dep_3_e i1_dep_3_place     i1_rcf_3_p 
##              0              0              0              0              0 
##     i1_rcf_3_e i1_rcf_3_place       i1_dlv_p       i1_dlv_e        i1_hops 
##              0              0              1              1              1 
##       i2_legid       i2_rcs_p       i2_rcs_e     i2_dep_1_p     i2_dep_1_e 
##              0              0              0              0              0 
## i2_dep_1_place     i2_rcf_1_p     i2_rcf_1_e i2_rcf_1_place     i2_dep_2_p 
##              0              0              0              0              0 
##     i2_dep_2_e i2_dep_2_place     i2_rcf_2_p     i2_rcf_2_e i2_rcf_2_place 
##              0              0              0              0              0 
##     i2_dep_3_p     i2_dep_3_e i2_dep_3_place     i2_rcf_3_p     i2_rcf_3_e 
##              0              0              0              0              0 
## i2_rcf_3_place       i2_dlv_p       i2_dlv_e        i2_hops       i3_legid 
##              0              0              0              0              0 
##       i3_rcs_p       i3_rcs_e     i3_dep_1_p     i3_dep_1_e i3_dep_1_place 
##              0              0              0              0              0 
##     i3_rcf_1_p     i3_rcf_1_e i3_rcf_1_place     i3_dep_2_p     i3_dep_2_e 
##              0              0              0              0              0 
## i3_dep_2_place     i3_rcf_2_p     i3_rcf_2_e i3_rcf_2_place     i3_dep_3_p 
##              0              0              0              0              0 
##     i3_dep_3_e i3_dep_3_place     i3_rcf_3_p     i3_rcf_3_e i3_rcf_3_place 
##              0              0              0              0              0 
##       i3_dlv_p       i3_dlv_e        i3_hops        o_legid        o_rcs_p 
##              0              0              0              1              1 
##        o_rcs_e      o_dep_1_p      o_dep_1_e  o_dep_1_place      o_rcf_1_p 
##              1              1              1              1              1 
##      o_rcf_1_e  o_rcf_1_place      o_dep_2_p      o_dep_2_e  o_dep_2_place 
##              1              1              0              0              0 
##      o_rcf_2_p      o_rcf_2_e  o_rcf_2_place      o_dep_3_p      o_dep_3_e 
##              0              0              0              0              0 
##  o_dep_3_place      o_rcf_3_p      o_rcf_3_e  o_rcf_3_place        o_dlv_p 
##              0              0              0              0              1 
##        o_dlv_e         o_hops           legs 
##              1              1              1

# Identify rows with missing values
missing_rows <- complete.cases(df)
# Create a new data set that excludes the rows with missing values #row 30-33 missing value
df <- df[missing_rows,]
head(df)

# Keep only the numeric columns and divide every value by the scalar 2.
# NOTE(review): the halving also applies to id, place, hop and leg columns;
# it cancels out in the later scale() / cor() steps -- confirm it is wanted.
is_numeric_col <- vapply(df, is.numeric, logical(1))
df_num <- df[, is_numeric_col] / 2
# Show the resulting data frame, then its dimensions.
df_num
dim(df_num)

## [1] 3942   26

# Remove the identifier and place-code columns from df_num; the remaining
# columns keep their original order.
id_cols <- c(
  "nr", "i1_legid", "i1_dep_1_place", "i1_rcf_1_place",
  "o_legid", "o_dep_1_place", "o_rcf_1_place"
)
df_new <- df_num[, !(names(df_num) %in% id_cols), drop = FALSE]
# Dimensions after dropping those columns.
dim(df_new)

## [1] 3942   19

# Standardize every column (subtract the column mean, divide by the column
# standard deviation); the result is a numeric matrix.
df_new_norm <- scale(df_new, center = TRUE, scale = TRUE)
head(df_new_norm, n = 6L)

##     i1_rcs_p    i1_rcs_e i1_dep_1_p  i1_dep_1_e i1_rcf_1_p  i1_rcf_1_e
## 1 -0.4280143 -0.31518862  0.0292841 -0.72490767  0.3159088  0.09113646
## 2 -0.2902919 -0.23481716 -0.8261285 -0.60516278  2.5886796  0.97971022
## 3  0.4647260  0.54144812 -0.8261285 -0.62998794  0.2477257 -0.15619850
## 4 -0.3084413 -0.32594873  0.2431373  0.09578287 -1.0363898 -0.11693899
## 5 -0.1295089 -0.03696284 -0.3984222 -0.68693978 -0.4795610 -0.07113621
## 6 -0.4318577 -0.34153991  0.2431373 -0.66065432 -1.3204862 -0.58020130
##      i1_dlv_p   i1_dlv_e    i1_hops     o_rcs_p    o_rcs_e  o_dep_1_p
## 1 -1.67688092 -0.1274864 -0.6510436  1.47547899  1.1476822  0.2085311
## 2  0.05037233  0.6336340 -0.6510436  0.05445461  0.1854621  0.5891651
## 3  0.05037233 -0.3455632 -0.6510436 -0.27496561 -0.3173577  0.2085311
## 4 -1.97356898 -0.3072476  1.4560285  0.24690863  0.2193505 -0.7430537
## 5  0.31474783 -0.1783352  1.4560285 -0.53920313 -0.4154224 -0.1721028
## 6  0.70249856 -0.3824465 -0.6510436 -0.54250352 -0.4471929  0.2085311
##     o_dep_1_e   o_rcf_1_p   o_rcf_1_e    o_dlv_p     o_dlv_e     o_hops
## 1  2.52099640  1.07534234  1.28583845 -1.6103113 -0.22635318 -0.9261854
## 2 -0.63346378 -0.04688199 -0.36773199  0.1710832 -0.22559048 -0.9261854
## 3  0.20193610  0.78061676  1.31940212 -1.7429070 -0.15105382 -0.9261854
## 4  0.09674666 -0.09222439  0.59442672  0.1191979 -0.24542070 -0.9261854
## 5 -0.82304940 -1.41848951 -1.17102264  0.3267390  0.07616165  1.0251897
## 6 -0.60655485  0.23650799 -0.01866976  0.2229685 -0.22406508 -0.9261854
##          legs
## 1 -0.01475652
## 2 -0.01475652
## 3 -1.22663550
## 4 -1.22663550
## 5 -1.22663550
## 6 -0.01475652

# Pairwise Pearson correlations between the standardized columns.
cor_matrix <- cor(x = df_new_norm, method = "pearson")
#cor_matrix
library(corrplot)

## corrplot 0.92 loaded

# Correlation plot of the standardized variables, drawn before the
# planned-vs-effective disruption columns are derived.
corrplot(cor_matrix)

# Principal component analysis and a summary of the variance explained.
# princomp() expects the observations themselves (rows = shipments,
# columns = variables). Passing the 19 x 19 correlation matrix as its first
# argument would treat each row of that matrix as an observation, so the fit
# is done on the standardized data instead.
data.pca <- princomp(df_new_norm)
summary(data.pca)

## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4     Comp.5
## Standard deviation     0.5155318 0.4759836 0.4031663 0.34157424 0.31579682
## Proportion of Variance 0.2101835 0.1791726 0.1285453 0.09226946 0.07886844
## Cumulative Proportion  0.2101835 0.3893561 0.5179014 0.61017087 0.68903931
##                            Comp.6     Comp.7     Comp.8     Comp.9    Comp.10
## Standard deviation     0.25008974 0.23647051 0.22837391 0.21805728 0.20656234
## Proportion of Variance 0.04946289 0.04422234 0.04124589 0.03760356 0.03374349
## Cumulative Proportion  0.73850221 0.78272455 0.82397044 0.86157400 0.89531749
##                           Comp.11    Comp.12    Comp.13    Comp.14    Comp.15
## Standard deviation     0.19969527 0.17276575 0.16535925 0.13628943 0.11490754
## Proportion of Variance 0.03153721 0.02360495 0.02162443 0.01468967 0.01044203
## Cumulative Proportion  0.92685470 0.95045965 0.97208409 0.98677376 0.99721579
##                            Comp.16      Comp.17      Comp.18 Comp.19
## Standard deviation     0.059063450 5.649199e-03 4.255576e-04       0
## Proportion of Variance 0.002758833 2.523838e-05 1.432202e-07       0
## Cumulative Proportion  0.999974618 9.999999e-01 1.000000e+00       1

library(corrplot)
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.2.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.2.3

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Loadings matrix: one column of variable weights (an eigenvector) per
# component. The eigenvalues are not stored here; they are data.pca$sdev^2.
loadings(data.pca)

## 
## Loadings:
##            Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## i1_rcs_p    0.512  0.155  0.266  0.116  0.269         0.123              
## i1_rcs_e    0.497  0.144  0.272  0.137  0.301         0.131              
## i1_dep_1_p               -0.325 -0.331                0.484  0.203  0.183
## i1_dep_1_e  0.196  0.111        -0.176 -0.163                0.202  0.460
## i1_rcf_1_p  0.288  0.233        -0.258 -0.327 -0.187 -0.191        -0.144
## i1_rcf_1_e  0.192  0.166        -0.273 -0.355 -0.223 -0.257 -0.166 -0.252
## i1_dlv_p   -0.137 -0.256  0.274  0.321               -0.336         0.229
## i1_dlv_e          -0.108 -0.102         0.151 -0.422 -0.271 -0.150  0.331
## i1_hops           -0.160 -0.125         0.439 -0.323 -0.141        -0.317
## o_rcs_p    -0.353  0.343  0.358 -0.269  0.178                            
## o_rcs_e    -0.338  0.319  0.379 -0.291  0.195                            
## o_dep_1_p                -0.435         0.313  0.308 -0.139 -0.172       
## o_dep_1_e  -0.147  0.214 -0.152                0.253        -0.200  0.321
## o_rcf_1_p  -0.116  0.416 -0.117  0.448 -0.115 -0.153  0.123        -0.116
## o_rcf_1_e  -0.144  0.375 -0.193  0.437        -0.219  0.126        -0.133
## o_dlv_p           -0.283  0.328        -0.365         0.307              
## o_dlv_e           -0.125               -0.103  0.365  0.252 -0.524 -0.379
## o_hops            -0.278                0.175 -0.330  0.331  0.160 -0.180
## legs                                           0.381 -0.319  0.689 -0.292
##            Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17
## i1_rcs_p                                                            0.152 
## i1_rcs_e            0.110                                                 
## i1_dep_1_p                 -0.455   0.206   0.293  -0.228           0.287 
## i1_dep_1_e -0.412  -0.403   0.393   0.153  -0.100   0.304                 
## i1_rcf_1_p                  0.112          -0.316  -0.642           0.228 
## i1_rcf_1_e  0.246                           0.451   0.477           0.167 
## i1_dlv_p           -0.409  -0.164           0.365  -0.254           0.410 
## i1_dlv_e   -0.349   0.635                                           0.201 
## i1_hops            -0.250           0.581  -0.271                   0.220 
## o_rcs_p                                                             0.153 
## o_rcs_e    -0.118                                                         
## o_dep_1_p          -0.152          -0.523  -0.251   0.188           0.374 
## o_dep_1_e   0.519   0.194   0.454   0.348          -0.111           0.147 
## o_rcf_1_p                                                  -0.704   0.158 
## o_rcf_1_e                                                   0.699   0.113 
## o_dlv_p     0.252   0.159  -0.109          -0.456   0.268           0.423 
## o_dlv_e    -0.479           0.183   0.174   0.103                   0.213 
## o_hops      0.122           0.556  -0.381   0.288  -0.112           0.169 
## legs       -0.115   0.286   0.127                                   0.249 
##            Comp.18 Comp.19
## i1_rcs_p    0.454   0.543 
## i1_rcs_e   -0.436  -0.545 
## i1_dep_1_p                
## i1_dep_1_e                
## i1_rcf_1_p                
## i1_rcf_1_e                
## i1_dlv_p                  
## i1_dlv_e                  
## i1_hops                   
## o_rcs_p    -0.549   0.441 
## o_rcs_e     0.538  -0.443 
## o_dep_1_p                 
## o_dep_1_e                 
## o_rcf_1_p                 
## o_rcf_1_e                 
## o_dlv_p                   
## o_dlv_e                   
## o_hops                    
## legs                      
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.053  0.053  0.053  0.053  0.053  0.053  0.053  0.053  0.053
## Cumulative Var  0.053  0.105  0.158  0.211  0.263  0.316  0.368  0.421  0.474
##                Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17
## SS loadings      1.000   1.000   1.000   1.000   1.000   1.000   1.000   1.000
## Proportion Var   0.053   0.053   0.053   0.053   0.053   0.053   0.053   0.053
## Cumulative Var   0.526   0.579   0.632   0.684   0.737   0.789   0.842   0.895
##                Comp.18 Comp.19
## SS loadings      1.000   1.000
## Proportion Var   0.053   0.053
## Cumulative Var   0.947   1.000

# Scree plot for data.pca, with a value label on each bar (addlabels = TRUE).
fviz_eig(data.pca, addlabels = TRUE)

# Plot of the variables on the first two components, drawn in black.
fviz_pca_var(data.pca, col.var = "black")

# cos2 of each variable, computed over components 1 and 2 (axes = 1:2).
fviz_cos2(data.pca, choice = "var", axes = 1:2)

Apply the Grubbs’ test: The Grubbs’ test is a statistical test that can be used to detect a single outlier in a univariate dataset. You can use the grubbs.test() function from the outliers package in R to perform this test. For example:

library(outliers)
# Grubbs' test for a single outlier in the i1_rcs_p column.
# NOTE(review): Grubbs' test assumes roughly normal data -- confirm that this
# is acceptable for this column.
grubbs.test(df$i1_rcs_p)

## 
##  Grubbs test for one outlier
## 
## data:  df$i1_rcs_p
## G = 9.60565, U = 0.97658, p-value < 2.2e-16
## alternative hypothesis: highest value 47190 is an outlier

Apply the Z-score test: The Z-score test is a statistical test that measures how many standard deviations an observation is from the mean of the dataset. We can use the scale() function in R to compute the Z-scores for each observation. For example:

# Z-scores of i1_rcs_p; scale() returns them as a one-column matrix.
z_scores <- scale(df[["i1_rcs_p"]])
head(z_scores, n = 6L)

##            [,1]
## [1,] -0.4280143
## [2,] -0.2902919
## [3,]  0.4647260
## [4,] -0.3084413
## [5,] -0.1295089
## [6,] -0.4318577

We can then use a threshold value to identify outliers. A common threshold is 3, which identifies values that are more than 3 standard deviations away from the mean. For example:

# Positions of the observations lying more than 3 standard deviations above
# or below the mean.
outliers <- which(z_scores > 3 | z_scores < -3)
print(outliers)

##  [1]   18   44  190  283  366  409  491  524  530  535  567  589  603  637  669
## [16]  697  708  738  809  811  813  881  904  914  941 1201 1212 1220 1314 1512
## [31] 1573 1604 1770 1807 1978 2071 2088 2090 2164 2192 2195 2208 2264 2273 2344
## [46] 2525 2527 2556 2586 2616 2638 2727 2790 2874 2989 3000 3024 3029 3062 3068
## [61] 3078 3090 3118 3132 3165 3209 3356 3372 3447 3742 3796 3801 3830

This will return the indices of the observations that are identified as outliers.

Apply the Tukey’s test: The Tukey’s test is a statistical test that identifies outliers in a dataset based on the interquartile range (IQR). We can use the boxplot.stats() function in R to compute the IQR and identify outliers. For example:

# Box-plot statistics for i1_rcs_p: $stats holds the whisker/hinge/median
# values and $out the points beyond the whiskers (1.5 * IQR rule, the default).
boxplot_stats <- boxplot.stats(df[["i1_rcs_p"]], coef = 1.5)
print(boxplot_stats)

## $stats
## [1]    5  377 1085 1947 4299
## 
## $n
## [1] 3942
## 
## $conf
## [1] 1045.491 1124.509
## 
## $out
##   [1]  4380  6145 27415  5442  5877 14854 22224  4845  6286  4909  5083 14220
##  [13]  4669  7731  5833  5845 37991 14387  4334 12045  5064  4843  6053  7080
##  [25] 20680  4386 10241  4983  8343  4763  4917  5432  8837 39492  4705  6426
##  [37]  9107 35769 12192  4859  5473  7054 38054  5618  5478 10861  7129  4310
##  [49]  6875 37750 38054 33285 27992  4557  4788 16375 38007  4810  8644  4540
##  [61] 25592  4378 19656 38353 24424 16207 12490  4832 37069  7649  4740  4873
##  [73] 37994 18950 38354  7646 14919  6691  5864 37993  6875  4508 20219  4590
##  [85]  4499 38053  5763 42824  4659 13205  4807  6423  7391  5994  4527  7756
##  [97]  9191  6188  5660 14854  5825  7488  8474  4379  4561  5892  6876 14715
## [109]  4499 22664  6165  5943 21505 34676  6050 11344  7441  8035 14665  5056
## [121]  8698  7360 14715 17574 13480  5563  7527 13644  4770 11994 16116  5688
## [133]  4620  5900  4351  5347 14854 13108  8661 10503  4515  5966 15405 13188
## [145]  9980 11732  6420 38054 11924  5008  5730  5755  5685  5888  4889 33151
## [157]  7640  5332 36761  6315  4562 11744  4935 16209  4651  5995 14854  7657
## [169]  5985  5787  4361  5823 27526  9076  8690 13087  4475 37993 10317  6477
## [181]  4797  8543  6831 16031 11564  4557  4886  5677 21865  4516  9145 13719
## [193]  4802  4864  8691  9880  5634  9576  4552  6231  7599  4406 30364  8413
## [205]  4791 25575 37506  6908  6622  5058 10431  4982 11694  7551  4879 16016
## [217]  7445  9687 31353  6959  4972  7367  7089 20143 37994  5971 21559  5937
## [229]  5924 10550  6265 38053  5285 21501  6153 12870  4539  7920  5599  4816
## [241]  5545 18945  4853  8424  9105  4402  6030  4523  6107  4368  4318  5252
## [253]  8033 14919  4772  5662  4676  4530  4831  6138  8192  8308 40198 47190
## [265] 11881 27570 11320  6013  4410  4704 38353  6624  5811  6766  5995 37411
## [277] 18566  9896 10117  4452  4590  5659  6175  6887 14639 17492  4724  8597
## [289]  6077  8350 17335 15866  9687 14854  8634 14854  6074  8513 37328  6739
## [301]  5718  4770 11708  4336  4650  4845  5502 11741  4887  4767  7591 37993
## [313] 33458  4379  4580 37993 39648  4311  6180 23598 37919  6747 32416  6696
## [325] 38053 11184 35342 37601  9056 11835 11490 39491  6945  4312  7302  6997
## [337]  4747 14854 23234  5648  4732  4575 10218 11579 12661 33087  6000  5527
## [349]  4350 26944 14715  4552  6662 11999 31617  4665  9438  4602  5850  4385
## [361] 13153  5830  4688 10980  5774 13479 17586  7165  5382  5042 12915 14855
## [373] 42990 37994 32519 10565  5537  4649  6847  5334 16183  7666 14919  5844
## [385]  4614  4772

ggplot2: ggplot2 is a powerful package for creating visualizations in R. We can use it to create various types of plots like scatterplots, boxplots, histograms, and more to explore the relationships and distributions of variables in the dataset. For example, we can use ggplot2 to create a scatterplot of two variables with the following code:

library(ggplot2)
# Scatter plot of i1_rcs_p (x axis) against i1_rcs_e (y axis).
# The colour has to be set on the point layer: arguments such as `colour`
# passed to ggplot() itself are silently ignored, so the points were never
# drawn in red. `width` is not a parameter of geom_point() and is dropped.
ggplot(df, aes(x = i1_rcs_p, y = i1_rcs_e)) +
  geom_point(colour = "red")

dplyr and tidyr: dplyr and tidyr are packages for manipulating and cleaning data in R. We can use them to filter, group, summarize, and reshape the data to extract meaningful insights. For example, you could use dplyr and tidyr to calculate summary statistics like mean, median, and standard deviation for a specific variable with the following code:

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
# One-row summary of i1_rcs_p: mean, median and standard deviation.
summarize(
  df,
  mean = mean(i1_rcs_p),
  median = median(i1_rcs_p),
  sd = sd(i1_rcs_p)
)

stats: stats is a built-in package in R that contains various statistical functions and tests. We can use it to perform hypothesis testing, calculate p-values, and more. For example, we could use stats to perform a t-test between two groups of data with the following code:

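# Paired t-test of i1_rcs_p against i1_rcs_e. The formula form `y ~ g` needs a
# two-level grouping factor on the right-hand side, which i1_rcs_e is not, so
# the two columns are passed as separate vectors instead.
#t.test(df$i1_rcs_p, df$i1_rcs_e, paired = TRUE)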

Finding the disruption: the difference between planned duration and effective duration

# Add one "_pe" column per stage to df_new: the "_p" column of df minus the
# matching "_e" column. A negative result means the "_e" value is the larger.
df_new[["i1_rcs_pe"]] <- with(df, i1_rcs_p - i1_rcs_e)
df_new[["i1_dep_1_pe"]] <- with(df, i1_dep_1_p - i1_dep_1_e)
df_new[["i1_rcf_1_pe"]] <- with(df, i1_rcf_1_p - i1_rcf_1_e)
df_new[["i1_dlv_pe"]] <- with(df, i1_dlv_p - i1_dlv_e)
df_new[["o_rcs_pe"]] <- with(df, o_rcs_p - o_rcs_e)
df_new[["o_dep_1_pe"]] <- with(df, o_dep_1_p - o_dep_1_e)
df_new[["o_rcf_1_pe"]] <- with(df, o_rcf_1_p - o_rcf_1_e)
df_new[["o_dlv_pe"]] <- with(df, o_dlv_p - o_dlv_e)

# Check the resulting dimensions, then preview the first rows.
dim(df_new)
head(df_new)

## [1] 3942   27

Correlation between the new disruption variables

# Correlation matrix of the eight planned-minus-effective ("_pe") columns.
# The columns are selected by name rather than by position (20:27), so the
# result cannot silently change if df_new gains or reorders columns.
pe_cols <- c(
  "i1_rcs_pe", "i1_dep_1_pe", "i1_rcf_1_pe", "i1_dlv_pe",
  "o_rcs_pe", "o_dep_1_pe", "o_rcf_1_pe", "o_dlv_pe"
)
cor_matrix <- cor(df_new[pe_cols])
cor_matrix

##                i1_rcs_pe  i1_dep_1_pe   i1_rcf_1_pe    i1_dlv_pe     o_rcs_pe
## i1_rcs_pe    1.000000000 -0.931206906 -0.0054907363 -0.019022511  0.014507883
## i1_dep_1_pe -0.931206906  1.000000000 -0.0308656464  0.027337464 -0.021876622
## i1_rcf_1_pe -0.005490736 -0.030865646  1.0000000000  0.001988295 -0.050860947
## i1_dlv_pe   -0.019022511  0.027337464  0.0019882952  1.000000000 -0.008196962
## o_rcs_pe     0.014507883 -0.021876622 -0.0508609473 -0.008196962  1.000000000
## o_dep_1_pe  -0.015593720  0.018061415  0.0530139448  0.015381367 -0.923032723
## o_rcf_1_pe   0.039677539 -0.047682815  0.0137866734  0.054912037  0.045102564
## o_dlv_pe    -0.006770061  0.009598674  0.0001278906  0.005106971  0.007111367
##               o_dep_1_pe  o_rcf_1_pe      o_dlv_pe
## i1_rcs_pe   -0.015593720  0.03967754 -0.0067700605
## i1_dep_1_pe  0.018061415 -0.04768282  0.0095986744
## i1_rcf_1_pe  0.053013945  0.01378667  0.0001278906
## i1_dlv_pe    0.015381367  0.05491204  0.0051069706
## o_rcs_pe    -0.923032723  0.04510256  0.0071113666
## o_dep_1_pe   1.000000000 -0.06365526 -0.0044848221
## o_rcf_1_pe  -0.063655265  1.00000000 -0.0333829944
## o_dlv_pe    -0.004484822 -0.03338299  1.0000000000

# Lower-triangle ellipse plot of cor_matrix with the coefficients printed in
# black. The title names the quantity being plotted, and `mar` leaves room
# above the plot so the title is not clipped.
corrplot(
  cor_matrix,
  method = "ellipse",
  type = "lower",
  title = "Correlation of planned-minus-effective differences",
  addCoef.col = "black",
  mar = c(0, 0, 1, 0),
  cex.main = 0.5
)

# Eigen-decomposition of the correlation matrix: $values holds the
# eigenvalues and $vectors the matching eigenvectors.
eig <- eigen(cor_matrix)
eig

## eigen() decomposition
## $values
## [1] 1.97443005 1.89722087 1.06218728 1.00376446 0.99411374 0.92356317 0.07740876
## [8] 0.06731167
## 
## $vectors
##              [,1]        [,2]        [,3]         [,4]         [,5]        [,6]
## [1,]  0.501815133  0.49483081  0.02764096  0.021447940 -0.041255808  0.03844461
## [2,] -0.504179085 -0.49358737 -0.01925220 -0.017776094  0.000863130 -0.03564541
## [3,] -0.037868933  0.07195946 -0.19886890  0.086091376  0.955973310  0.17823000
## [4,] -0.030415887 -0.01297102 -0.61516301  0.467408380 -0.276478249  0.57051315
## [5,]  0.490724834 -0.50245227  0.02470521  0.010121058  0.050640511  0.06558617
## [6,] -0.491121724  0.50332581 -0.01570473 -0.003076132 -0.050302218 -0.04561641
## [7,]  0.097861507 -0.01197810 -0.69183318 -0.034141179 -0.003030366 -0.71429912
## [8,] -0.006128237 -0.01513055  0.31843496  0.878671815  0.053554302 -0.35127567
##               [,7]         [,8]
## [1,]  0.1834615202 -0.682101050
## [2,]  0.1819033307 -0.683467745
## [3,]  0.0090788171 -0.026341764
## [4,]  0.0044731153  0.008062990
## [5,] -0.6818744614 -0.184930477
## [6,] -0.6841308957 -0.180444534
## [7,] -0.0127258070 -0.009301505
## [8,]  0.0008975133  0.002251604

####################################### PCA
# Principal component analysis of the eight "_pe" columns of df_new.
# princomp() expects observations in rows, so passing it the 8 x 8 correlation
# matrix makes it treat that matrix as 8 observations of 8 variables. Fit on
# the data itself (the same columns cor_matrix is built from) and set
# cor = TRUE so the components are those of the correlation matrix.
df_pca <- princomp(df_new[20:27], cor = TRUE)
df_pca

## Call:
## princomp(x = cor_matrix)
## 
## Standard deviations:
##     Comp.1     Comp.2     Comp.3     Comp.4     Comp.5     Comp.6     Comp.7 
## 0.69805064 0.67072851 0.36871906 0.35211344 0.32748945 0.25757418 0.02653907 
##     Comp.8 
## 0.00000000 
## 
##  8  variables and  8 observations.

# Per-component standard deviation, proportion of variance and cumulative
# proportion of variance.
summary(df_pca)

## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4    Comp.5
## Standard deviation     0.6980506 0.6707285 0.36871906 0.35211344 0.3274894
## Proportion of Variance 0.3553152 0.3280450 0.09913593 0.09040764 0.0782050
## Cumulative Proportion  0.3553152 0.6833602 0.78249615 0.87290379 0.9511088
##                            Comp.6       Comp.7 Comp.8
## Standard deviation     0.25757418 0.0265390716      0
## Proportion of Variance 0.04837763 0.0005135839      0
## Cumulative Proportion  0.99948642 1.0000000000      1

Find the negative values in i1_rcs_pe (cases where the effective value exceeded the planned value)

# Collect the entries of i1_rcs_pe that are below zero.
neg_i1_rcs_pe <- with(df_new, i1_rcs_pe[i1_rcs_pe < 0])

# Print the negative values, or say so when there are none.
if (length(neg_i1_rcs_pe) == 0) {
  cat("No negative values found\n")
} else {
  cat("Negative values found:", neg_i1_rcs_pe, "\n")
}

## Negative values found: -19 -13 -488 -198 -24 -151 -87 -639 -60 -22 -153 -12 -108 -2 -212 -44 -100 -20 -99 -68 -36 -10 -600 -1 -7 -82 -24 -4 -27 -100 -20 -12 -90 -99 -1 -59 -36 -94 -18 -90 -20 -21 -28 -90 -29 -41 -32 -6 -1 -16 -3 -34 -47 -780 -10 -56 -64 -63 -11 -5 -32 -18 -22 -199 -54 -7 -4 -11 -80 -142 -19 -47 -126 -4 -82 -305 -40 -688 -46 -204 -77 -11 -38 -15 -8 -2 -41 -29 -74 -1200 -76 -13 -10 -28 -30 -109 -1 -65 -75 -7 -16 -8 -55 -19 -25 -23 -3 -2 -33 -11 -219 -419 -6 -4 -63 -15 -10 -41 -137 -8 -17 -20 -197 -12 -15 -10 -300 -70 -90 -50 -65 -656 -38 -8 -10 -22 -26 -85 -66 -2 -45 -65 -15 -15 -209 -424 -94 -93 -1 -230 -9 -10 -17 -4 -68 -42 -22 -6 -40 -41 -1 -25 -9 -54 -47 -9 -420 -316 -68 -53 -60 -43 -109 -227 -320 -179 -18 -14 -76 -26 -872 -60 -75 -25 -20 -4 -611 -18 -3 -47 -42 -6 -47 -128 -82 -17 -6 -11 -305 -90 -13 -5 -5 -30 -37 -41 -2 -91 -298 -45 -6 -15 -28 -11 -48 -1 -5 -67 -32 -51 -105 -402 -70 -889 -5 -19 -1 -1 -8 -25 -107 -620 -39 -56 -8 -16 -366 -6

# Print a label, then the number of negative entries found in i1_rcs_pe.
print("total negative value:")

## [1] "total negative value:"

print(length(neg_i1_rcs_pe))

## [1] 238

library(nFactors)

## Warning: package 'nFactors' was built under R version 4.2.3

## Loading required package: lattice

## 
## Attaching package: 'nFactors'

## The following object is masked from 'package:lattice':
## 
##     parallel

# Loadings of the fitted PCA: one row per variable, one column per component.
load <- df_pca[["loadings"]]
str(load)  # inspect the structure of the loadings object

##  'loadings' num [1:8, 1:8] 0.5013 -0.5038 -0.0394 -0.0321 0.4911 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:8] "i1_rcs_pe" "i1_dep_1_pe" "i1_rcf_1_pe" "i1_dlv_pe" ...
##   ..$ : chr [1:8] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...

# Loadings of every variable on the first component.
load[, "Comp.1"]

##    i1_rcs_pe  i1_dep_1_pe  i1_rcf_1_pe    i1_dlv_pe     o_rcs_pe   o_dep_1_pe 
##  0.501342342 -0.503804494 -0.039410827 -0.032081104  0.491108387 -0.491661356 
##   o_rcf_1_pe     o_dlv_pe 
##  0.096328808 -0.007543362

# Sum of squared loadings (SS loading) for component 1.
sum(load[, "Comp.1"]^2)

## [1] 1

# Scree plot of the fitted PCA ----
fviz_eig(df_pca)

# NOTE(review): the original note here read "dim = 2<5"; its meaning is not
# clear from this file -- confirm what was intended.

# Variable plot of the PCA, with the variables drawn in black.
fviz_pca_var(df_pca, col.var = "black")

# cos2 of the variables on components 1 and 2.
fviz_cos2(df_pca, choice = "var", axes = 1:2)