Part 3 Q2

Step 1 to load data

From the previous projects, we have cleaned up abnormal data. Since aircraft and aircraft_num are the same, I am eliminating the aircraft variable as well, with knowing airbus = 1, and boeing = 0. I also converted the aircraft_num and distance_Y to factor variable.

library(dplyr)
library(nnet)
library(ggplot2)
FAA <- read.csv("FAA_wrangled.csv", header = TRUE)

#create a multinomial variable and attach it to dataset
FAA <- FAA %>%
  mutate(distance_Y = case_when(
    distance < 1000 ~ 1,
    distance >= 1000 & distance < 2500 ~ 2,
    TRUE ~ 3
  ))

#discard variable distance
FAA <- FAA[, c(-1,-7)]
FAA$distance_Y <- as.factor(FAA$distance_Y)
FAA$aircraft_num <- as.factor(FAA$aircraft_num)
attach(FAA)
str(FAA)
'data.frame':   195 obs. of  8 variables:
 $ no_pasg     : int  41 44 47 48 48 48 49 49 49 51 ...
 $ speed_ground: num  97.6 99.6 92.9 101.8 109.3 ...
 $ speed_air   : num  97 99.2 95.8 103.6 109.6 ...
 $ height      : num  38.4 35.2 23.8 23 33.1 ...
 $ pitch       : num  3.53 3.84 3.91 4.94 4.04 ...
 $ duration    : num  123 139 169 157 200 ...
 $ aircraft_num: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
 $ distance_Y  : Factor w/ 2 levels "2","3": 1 1 1 2 2 2 2 2 2 2 ...
summary(FAA)
    no_pasg       speed_ground      speed_air          height           pitch          duration    
 Min.   :41.00   Min.   : 88.69   Min.   : 90.00   Min.   : 9.697   Min.   :2.702   Min.   : 45.5  
 1st Qu.:56.00   1st Qu.: 95.28   1st Qu.: 96.15   1st Qu.:23.365   1st Qu.:3.636   1st Qu.:115.9  
 Median :60.00   Median :100.75   Median :100.89   Median :29.837   Median :4.070   Median :149.3  
 Mean   :59.83   Mean   :103.43   Mean   :103.50   Mean   :30.359   Mean   :4.043   Mean   :150.9  
 3rd Qu.:65.00   3rd Qu.:109.57   3rd Qu.:109.42   3rd Qu.:36.590   3rd Qu.:4.442   3rd Qu.:185.4  
 Max.   :80.00   Max.   :132.78   Max.   :132.91   Max.   :58.228   Max.   :5.311   Max.   :287.0  
 aircraft_num distance_Y
 0:118        2: 95     
 1: 77        3:100     
                        
                        
                        
                        

Q2. The number of passengers is often of interest of airlines. What distribution would you use to model this variable? Do we have any variables (in the FAA data set) that are useful for predicting the number of passengers on board?

Number of passenger is a count variable, so I am using Poisson distribution to model this variable.

#fit a GLM
mod <- glm(no_pasg ~ ., family = poisson, FAA)
summary(mod)

Call:
glm(formula = no_pasg ~ ., family = poisson, data = FAA)

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    4.139e+00  1.782e-01  23.228   <2e-16 ***
speed_ground   7.798e-04  6.178e-03   0.126    0.900    
speed_air     -8.324e-04  6.310e-03  -0.132    0.895    
height        -5.875e-05  1.028e-03  -0.057    0.954    
pitch         -4.671e-03  1.800e-02  -0.260    0.795    
duration      -1.742e-04  1.961e-04  -0.889    0.374    
aircraft_num1  1.244e-02  2.111e-02   0.589    0.556    
distance_Y3    5.709e-04  2.971e-02   0.019    0.985    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 162.74  on 194  degrees of freedom
Residual deviance: 161.31  on 187  degrees of freedom
AIC: 1332.7

Number of Fisher Scoring iterations: 4
step(mod)
Start:  AIC=1332.69
no_pasg ~ speed_ground + speed_air + height + pitch + duration + 
    aircraft_num + distance_Y

               Df Deviance    AIC
- distance_Y    1   161.31 1330.7
- height        1   161.31 1330.7
- speed_ground  1   161.32 1330.7
- speed_air     1   161.32 1330.7
- pitch         1   161.38 1330.8
- aircraft_num  1   161.65 1331.0
- duration      1   162.10 1331.5
<none>              161.31 1332.7

Step:  AIC=1330.69
no_pasg ~ speed_ground + speed_air + height + pitch + duration + 
    aircraft_num

               Df Deviance    AIC
- height        1   161.31 1328.7
- speed_ground  1   161.32 1328.7
- speed_air     1   161.32 1328.7
- pitch         1   161.38 1328.8
- aircraft_num  1   161.67 1329.0
- duration      1   162.10 1329.5
<none>              161.31 1330.7

Step:  AIC=1328.69
no_pasg ~ speed_ground + speed_air + pitch + duration + aircraft_num

               Df Deviance    AIC
- speed_ground  1   161.33 1326.7
- speed_air     1   161.33 1326.7
- pitch         1   161.38 1326.8
- aircraft_num  1   161.67 1327.0
- duration      1   162.11 1327.5
<none>              161.31 1328.7

Step:  AIC=1326.71
no_pasg ~ speed_air + pitch + duration + aircraft_num

               Df Deviance    AIC
- speed_air     1   161.33 1324.7
- pitch         1   161.40 1324.8
- aircraft_num  1   161.69 1325.1
- duration      1   162.18 1325.6
<none>              161.33 1326.7

Step:  AIC=1324.71
no_pasg ~ pitch + duration + aircraft_num

               Df Deviance    AIC
- pitch         1   161.40 1322.8
- aircraft_num  1   161.69 1323.1
- duration      1   162.18 1323.6
<none>              161.33 1324.7

Step:  AIC=1322.78
no_pasg ~ duration + aircraft_num

               Df Deviance    AIC
- aircraft_num  1   161.97 1321.3
- duration      1   162.24 1321.6
<none>              161.40 1322.8

Step:  AIC=1321.35
no_pasg ~ duration

           Df Deviance    AIC
- duration  1   162.74 1320.1
<none>          161.97 1321.3

Step:  AIC=1320.12
no_pasg ~ 1


Call:  glm(formula = no_pasg ~ 1, family = poisson, data = FAA)

Coefficients:
(Intercept)  
      4.091  

Degrees of Freedom: 194 Total (i.e. Null);  194 Residual
Null Deviance:      162.7 
Residual Deviance: 162.7    AIC: 1320
drop1(mod, test = "LRT")
Single term deletions

Model:
no_pasg ~ speed_ground + speed_air + height + pitch + duration + 
    aircraft_num + distance_Y
             Df Deviance    AIC     LRT Pr(>Chi)
<none>            161.31 1332.7                 
speed_ground  1   161.32 1330.7 0.01593   0.8996
speed_air     1   161.32 1330.7 0.01740   0.8951
height        1   161.31 1330.7 0.00327   0.9544
pitch         1   161.38 1330.8 0.06734   0.7953
duration      1   162.10 1331.5 0.79052   0.3739
aircraft_num  1   161.65 1331.0 0.34706   0.5558
distance_Y    1   161.31 1330.7 0.00037   0.9847
#check the correlation
round(cor(FAA[,1:6]), 2)
             no_pasg speed_ground speed_air height pitch duration
no_pasg         1.00         0.00      0.00  -0.01 -0.04    -0.07
speed_ground    0.00         1.00      0.99  -0.10 -0.06     0.02
speed_air       0.00         0.99      1.00  -0.09 -0.05     0.04
height         -0.01        -0.10     -0.09   1.00 -0.03     0.07
pitch          -0.04        -0.06     -0.05  -0.03  1.00    -0.06
duration       -0.07         0.02      0.04   0.07 -0.06     1.00

I built a model called “mod” and from the summary statistics, we can see that there is no variable that’s significant enough to impact the number of passengers variable. From the step method, I got the null model.

I also decided to check the correlation, and saw that only speed_ground and speed_air are highly correlated. We have learned to remove the speed_ground from the previous projects. This is not much of a help, as in the summary statistics, non of the variables showed significance.

In the next code, I tried to add the dispersion parameter and see if that could provide better result.

#take a look at the observed data and predicted data using the model
mod$y
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27 
 41  44  47  48  48  48  49  49  49  51  52  52  53  53  54  54  54  55  55  55  56  56  56  56  57  58  58 
 28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
 58  58  59  60  60  60  60  60  61  61  61  62  62  62  62  62  62  62  63  63  63  63  63  64  64  65  65 
 55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81 
 65  65  65  65  66  66  66  66  66  66  67  67  67  68  68  68  69  70  71  72  73  75  80  43  44  46  46 
 82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
 46  46  46  47  48  48  49  49  52  52  52  52  52  52  53  53  53  53  54  54  54  55  55  55  56  56  56 
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 
 56  57  57  57  57  57  57  58  58  58  58  58  58  58  59  59  59  59  59  59  59  59  59  59  59  60  60 
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 
 60  60  60  60  60  60  60  60  60  61  61  61  61  61  61  61  61  61  61  61  62  62  62  63  63  63  63 
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 
 63  63  64  64  64  64  64  64  65  65  65  65  66  66  66  66  67  67  67  68  68  68  69  69  69  69  70 
190 191 192 193 194 195 
 70  70  72  72  75  79 
round(predict(mod, type = "response"),1)
   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20   21   22 
60.7 60.5 60.0 59.9 59.7 60.3 60.3 60.8 59.9 60.5 60.7 60.5 60.0 60.4 60.5 59.7 60.3 59.1 60.0 59.8 61.3 59.7 
  23   24   25   26   27   28   29   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44 
60.6 60.5 59.5 60.0 60.8 61.7 60.5 59.8 60.5 60.2 60.8 60.2 60.5 60.5 59.9 59.7 60.3 60.8 60.3 60.9 60.5 60.8 
  45   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63   64   65   66 
60.4 61.0 60.6 60.3 59.9 60.8 60.7 60.5 60.2 59.8 61.3 60.7 60.8 59.7 59.5 59.7 59.5 61.3 60.7 60.6 60.0 60.4 
  67   68   69   70   71   72   73   74   75   76   77   78   79   80   81   82   83   84   85   86   87   88 
60.4 60.3 60.1 59.4 60.0 59.1 61.3 60.2 59.8 60.8 60.3 59.5 60.5 58.9 58.4 59.2 59.4 59.5 59.6 59.4 59.6 60.0 
  89   90   91   92   93   94   95   96   97   98   99  100  101  102  103  104  105  106  107  108  109  110 
59.7 60.1 59.7 58.9 59.7 58.0 60.4 57.9 59.9 59.1 60.0 58.8 60.5 58.4 60.3 60.2 59.4 60.1 59.2 59.9 58.7 59.7 
 111  112  113  114  115  116  117  118  119  120  121  122  123  124  125  126  127  128  129  130  131  132 
60.1 59.7 59.7 59.9 59.8 58.9 59.9 58.7 59.8 59.6 59.0 58.6 58.7 59.8 58.7 59.2 58.9 60.0 58.7 59.6 59.6 59.5 
 133  134  135  136  137  138  139  140  141  142  143  144  145  146  147  148  149  150  151  152  153  154 
58.2 58.9 60.3 60.1 59.5 59.1 60.2 59.7 59.7 59.8 59.6 60.0 60.0 59.0 59.7 58.9 59.3 59.3 59.9 59.7 59.5 60.1 
 155  156  157  158  159  160  161  162  163  164  165  166  167  168  169  170  171  172  173  174  175  176 
59.4 59.2 60.4 59.1 59.1 59.3 59.1 59.8 59.0 59.5 60.0 60.0 59.9 58.6 60.0 59.9 59.2 58.7 58.9 59.4 59.9 59.4 
 177  178  179  180  181  182  183  184  185  186  187  188  189  190  191  192  193  194  195 
59.4 60.2 60.1 60.0 59.2 59.7 59.9 59.2 59.3 60.1 59.7 58.9 60.1 60.1 60.1 59.9 59.4 60.1 59.5 
gof<-sum(residuals(mod,type="pearson")^2)
#add dispersion parameter
dp <- gof/mod$df.res
summary(mod, dispersion = dp)

Call:
glm(formula = no_pasg ~ ., family = poisson, data = FAA)

Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
(Intercept)    4.139e+00  1.643e-01  25.191   <2e-16 ***
speed_ground   7.798e-04  5.696e-03   0.137    0.891    
speed_air     -8.324e-04  5.818e-03  -0.143    0.886    
height        -5.875e-05  9.480e-04  -0.062    0.951    
pitch         -4.671e-03  1.660e-02  -0.281    0.778    
duration      -1.742e-04  1.808e-04  -0.964    0.335    
aircraft_num1  1.244e-02  1.946e-02   0.639    0.523    
distance_Y3    5.709e-04  2.740e-02   0.021    0.983    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 0.8502401)

    Null deviance: 162.74  on 194  degrees of freedom
Residual deviance: 161.31  on 187  degrees of freedom
AIC: 1332.7

Number of Fisher Scoring iterations: 4

From the new summary statistics with dispersion parameter, the P value for all variables other than the intercept is still showing non significant. Therefore, my conclusion is that there is no useful variables in the cleaned FAA dataset to predict the number of passengers on board.

LS0tCnRpdGxlOiAiQkFOQSA3MDQyIE1vZHVsZSA2IFByb2plY3QgUGFydCAzIFEyIgpvdXRwdXQ6CiAgaHRtbF9ub3RlYm9vazogZGVmYXVsdAogIHdvcmRfZG9jdW1lbnQ6IGRlZmF1bHQKLS0tCgojIyBQYXJ0IDMgUTIKCiMjIyMjIFN0ZXAgMSB0byBsb2FkIGRhdGEKCkZyb20gdGhlIHByZXZpb3VzIHByb2plY3RzLCB3ZSBoYXZlIGNsZWFuZWQgdXAgYWJub3JtYWwgZGF0YS4gU2luY2UgYWlyY3JhZnQgYW5kIGFpcmNyYWZ0X251bSBhcmUgdGhlIHNhbWUsIEkgYW0gZWxpbWluYXRpbmcgdGhlIGFpcmNyYWZ0IHZhcmlhYmxlIGFzIHdlbGwsIHdpdGgga25vd2luZyBhaXJidXMgPSAxLCBhbmQgYm9laW5nID0gMC4gSSBhbHNvIGNvbnZlcnRlZCB0aGUgYWlyY3JhZnRfbnVtIGFuZCBkaXN0YW5jZV9ZIHRvIGZhY3RvciB2YXJpYWJsZS4KCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CmxpYnJhcnkoZHBseXIpCmxpYnJhcnkobm5ldCkKbGlicmFyeShnZ3Bsb3QyKQpGQUEgPC0gcmVhZC5jc3YoIkZBQV93cmFuZ2xlZC5jc3YiLCBoZWFkZXIgPSBUUlVFKQoKI2NyZWF0ZSBhIG11bHRpbm9taWFsIHZhcmlhYmxlIGFuZCBhdHRhY2ggaXQgdG8gZGF0YXNldApGQUEgPC0gRkFBICU+JQogIG11dGF0ZShkaXN0YW5jZV9ZID0gY2FzZV93aGVuKAogICAgZGlzdGFuY2UgPCAxMDAwIH4gMSwKICAgIGRpc3RhbmNlID49IDEwMDAgJiBkaXN0YW5jZSA8IDI1MDAgfiAyLAogICAgVFJVRSB+IDMKICApKQoKI2Rpc2NhcmQgdmFyaWFibGUgZGlzdGFuY2UKRkFBIDwtIEZBQVssIGMoLTEsLTcpXQpGQUEkZGlzdGFuY2VfWSA8LSBhcy5mYWN0b3IoRkFBJGRpc3RhbmNlX1kpCkZBQSRhaXJjcmFmdF9udW0gPC0gYXMuZmFjdG9yKEZBQSRhaXJjcmFmdF9udW0pCmF0dGFjaChGQUEpCnN0cihGQUEpCnN1bW1hcnkoRkFBKQpgYGAKCiMjIyBRMi4gVGhlIG51bWJlciBvZiBwYXNzZW5nZXJzIGlzIG9mdGVuIG9mIGludGVyZXN0IG9mIGFpcmxpbmVzLiBXaGF0IGRpc3RyaWJ1dGlvbiB3b3VsZCB5b3UgdXNlIHRvIG1vZGVsIHRoaXMgdmFyaWFibGU/IERvIHdlIGhhdmUgYW55IHZhcmlhYmxlcyAoaW4gdGhlIEZBQSBkYXRhIHNldCkgdGhhdCBhcmUgdXNlZnVsIGZvciBwcmVkaWN0aW5nIHRoZSBudW1iZXIgb2YgcGFzc2VuZ2VycyBvbiBib2FyZD8KCk51bWJlciBvZiBwYXNzZW5nZXIgaXMgYSBjb3VudCB2YXJpYWJsZSwgc28gSSBhbSB1c2luZyBQb2lzc29uIGRpc3RyaWJ1dGlvbiB0byBtb2RlbCB0aGlzIHZhcmlhYmxlLgoKYGBge3J9CiNmaXQgYSBHTE0KbW9kIDwtIGdsbShub19wYXNnIH4gLiwgZmFtaWx5ID0gcG9pc3NvbiwgRkFBKQpzdW1tYXJ5KG1vZCkKc3RlcChtb2QpCmRyb3AxKG1vZCwgdGVzdCA9ICJMUlQiKQoKI2NoZWNrIHRoZSBjb3JyZWxhdGlvbgpyb3VuZChjb3IoRkFBWywxOjZdKSwgMikKYGBgCgpJIGJ1aWx0IGEgbW9kZWwgY2FsbGVkICJtb2QiIGFuZCBmcm9tIHRoZSBzdW1tYXJ5IHN0YXRpc3RpY3MsIHdlIGNhbiBzZWUgdGhhdCB0aGVyZSBpcyBubyB2YXJpYWJsZSB0aGF0J3Mgc2lnbmlmaWNhbnQgZW5vdWdoIHRvIGltcGFjdCB0aGUgbnVtYmVyIG9mIHBhc3NlbmdlcnMgdmFyaWFibGUuIEZyb20gdGhlIHN0ZXAgbWV0aG9kLCBJIGdvdCB0aGUgbnVsbCBtb2RlbC4KCkkgYWxzbyBkZWNpZGVkIHRvIGNoZWNrIHRoZSBjb3JyZWxhdGlvbiwgYW5kIHNhdyB0aGF0IG9ubHkgc3BlZWRfZ3JvdW5kIGFuZCBzcGVlZF9haXIgYXJlIGhpZ2hseSBjb3JyZWxhdGVkLiBXZSBoYXZlIGxlYXJuZWQgdG8gcmVtb3ZlIHRoZSBzcGVlZF9ncm91bmQgZnJvbSB0aGUgcHJldmlvdXMgcHJvamVjdHMuIFRoaXMgaXMgbm90IG11Y2ggb2YgYSBoZWxwLCBhcyBpbiB0aGUgc3VtbWFyeSBzdGF0aXN0aWNzLCBub24gb2YgdGhlIHZhcmlhYmxlcyBzaG93ZWQgc2lnbmlmaWNhbmNlLgoKSW4gdGhlIG5leHQgY29kZSwgSSB0cmllZCB0byBhZGQgdGhlIGRpc3BlcnNpb24gcGFyYW1ldGVyIGFuZCBzZWUgaWYgdGhhdCBjb3VsZCBwcm92aWRlIGJldHRlciByZXN1bHQuCgpgYGB7cn0KI3Rha2UgYSBsb29rIGF0IHRoZSBvYnNlcnZlZCBkYXRhIGFuZCBwcmVkaWN0ZWQgZGF0YSB1c2luZyB0aGUgbW9kZWwKbW9kJHkKcm91bmQocHJlZGljdChtb2QsIHR5cGUgPSAicmVzcG9uc2UiKSwxKQpnb2Y8LXN1bShyZXNpZHVhbHMobW9kLHR5cGU9InBlYXJzb24iKV4yKQojYWRkIGRpc3BlcnNpb24gcGFyYW1ldGVyCmRwIDwtIGdvZi9tb2QkZGYucmVzCnN1bW1hcnkobW9kLCBkaXNwZXJzaW9uID0gZHApCmBgYAoKRnJvbSB0aGUgbmV3IHN1bW1hcnkgc3RhdGlzdGljcyB3aXRoIGRpc3BlcnNpb24gcGFyYW1ldGVyLCB0aGUgUCB2YWx1ZSBmb3IgYWxsIHZhcmlhYmxlcyBvdGhlciB0aGFuIHRoZSBpbnRlcmNlcHQgaXMgc3RpbGwgc2hvd2luZyBub24gc2lnbmlmaWNhbnQuIFRoZXJlZm9yZSwgbXkgY29uY2x1c2lvbiBpcyB0aGF0IHRoZXJlIGlzIG5vIHVzZWZ1bCB2YXJpYWJsZXMgaW4gdGhlIGNsZWFuZWQgRkFBIGRhdGFzZXQgdG8gcHJlZGljdCB0aGUgbnVtYmVyIG9mIHBhc3NlbmdlcnMgb24gYm9hcmQuCg==