# rio supplies import(), used below to read the CSV file.
library(rio)

# Read the raw trip records and normalise the column names to lower-case,
# syntactically valid R names.
trips <- import("6304 Regression Project Data.csv")
colnames(trips) <- tolower(make.names(colnames(trips)))

# Draw a reproducible random sample of 100 trips without replacement.
# seq_len() is used instead of 1:nrow() so that an empty table cannot produce
# the bogus index vector c(1, 0).
set.seed(71088966)
some.trips <- trips[sample(seq_len(nrow(trips)), 100, replace = FALSE), ]
# NOTE(review): attach() is kept only because the scatter plots further down
# refer to the columns by bare name; prefer some.trips$col or with() in new
# code.
attach(some.trips)

# View the structure of the sampled data set
str(some.trips)
## 'data.frame':    100 obs. of  9 variables:
##  $ taxi_id     : int  197 1615 6969 2271 5414 6785 7131 1490 1015 7746 ...
##  $ trip_seconds: int  180 240 600 780 180 120 420 420 120 1080 ...
##  $ trip_miles  : num  0 0 0.2 2.6 0 0.4 1.4 0 0 0.6 ...
##  $ fare        : num  5 5.25 10.45 9.75 4.75 ...
##  $ tips        : num  0 0 0 0 1 2 0 0 0 0 ...
##  $ tolls       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ extras      : num  0 1 0 1 0 0 0 1.5 0 0 ...
##  $ trip_total  : num  5 6.25 10.45 10.75 5.75 ...
##  $ payment_type: chr  "Cash" "Credit Card" "Cash" "Cash" ...
# Scatter plots of fare against trip duration and trip distance, used to spot
# aberrant records. Each red line is a reference line through the origin
# (slope 0.01 per second and slope 2 per mile respectively).
plot(trip_seconds, fare)
abline(a = 0, b = 0.01, col = "red")

plot(trip_miles, fare)
abline(a = 0, b = 2, col = "red")

# Remove aberrant cases: keep only trips whose recorded duration and distance
# are both non-zero.
cleansed_some.trip <- subset(
  some.trips,
  trip_seconds != 0 & trip_miles != 0
)
# Attaching the cleansed data masks the identically named columns of
# some.trips that were attached earlier.
attach(cleansed_some.trip)
## The following objects are masked from some.trips:
## 
##     extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
##     trip_seconds, trip_total

Discussions and Interpretations:

# Summaries and density plots of the continuous variables.
# Every column is read explicitly from cleansed_some.trip rather than through
# the attach()ed search path: two data frames with identical column names are
# attached, so a bare name only resolves to the cleansed data because of the
# attach order.
# Trip Seconds
summary(cleansed_some.trip$trip_seconds)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   120.0   360.0   600.0   774.7   900.0  3240.0
plot(density(cleansed_some.trip$trip_seconds), lwd = 3,
     main = "Trip Seconds Plot")

# Trip Miles
summary(cleansed_some.trip$trip_miles)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.100   0.675   1.600   3.916   4.925  30.300
plot(density(cleansed_some.trip$trip_miles), lwd = 3,
     main = "Trip Miles Plot")

# Trip Fare
summary(cleansed_some.trip$fare)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.250   6.688   9.250  14.604  16.375  71.250
plot(density(cleansed_some.trip$fare), lwd = 3,
     main = "Trip Fare Plot")

# Trip Tips
summary(cleansed_some.trip$tips)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.393   2.000  10.600
plot(density(cleansed_some.trip$tips), lwd = 3,
     main = "Trip Tip Plot")

# Trip Extras
summary(cleansed_some.trip$extras)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.6103  1.0000  4.5000
plot(density(cleansed_some.trip$extras), lwd = 3,
     main = "Trip Extra Plot")

# Trip Total
summary(cleansed_some.trip$trip_total)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.450   7.438  10.250  16.673  19.360  79.750
plot(density(cleansed_some.trip$trip_total), lwd = 3,
     main = "Trip Total Plot")

# Determining the number of cases in each payment type.
# table() reports one count per level of payment_type; split() would instead
# print every row of the data grouped by level, which does not give counts.
table(cleansed_some.trip$payment_type)
## 
##        Cash Credit Card 
##          44          24 
# Correlation analysis: keep only the numeric trip measures.
continuous_var_some.trip <- cleansed_some.trip[
  , c("trip_seconds", "trip_miles", "fare", "tips", "tolls", "extras",
      "trip_total")
]

# Pairwise correlation matrix of the selected columns, printed and then
# drawn twice with corrplot (circles, then numeric coefficients).
library(corrplot)
## corrplot 0.84 loaded
cor_matrix <- cor(continuous_var_some.trip)
print(cor_matrix)
##              trip_seconds trip_miles      fare        tips       tolls
## trip_seconds    1.0000000  0.7730028 0.8884842  0.58984374  0.22797164
## trip_miles      0.7730028  1.0000000 0.9195883  0.53326244  0.56894630
## fare            0.8884842  0.9195883 1.0000000  0.48437961  0.52150930
## tips            0.5898437  0.5332624 0.4843796  1.00000000 -0.06692989
## tolls           0.2279716  0.5689463 0.5215093 -0.06692989  1.00000000
## extras          0.6245751  0.6021173 0.6600784  0.34188625  0.35617831
## trip_total      0.8996058  0.9268045 0.9894412  0.59369275  0.49029068
##                 extras trip_total
## trip_seconds 0.6245751  0.8996058
## trip_miles   0.6021173  0.9268045
## fare         0.6600784  0.9894412
## tips         0.3418862  0.5936928
## tolls        0.3561783  0.4902907
## extras       1.0000000  0.6988474
## trip_total   0.6988474  1.0000000
corrplot(cor_matrix, method = "circle", type = "lower")

corrplot(cor_matrix, method = "number", type = "lower")

# Correlation matrix together with the p-value of each pairwise correlation.
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units

# rcorr() is given a matrix, so the data frame is converted first.
trip_measure_matrix <- as.matrix(continuous_var_some.trip)
xx <- rcorr(trip_measure_matrix)
print(xx)
##              trip_seconds trip_miles fare  tips tolls extras trip_total
## trip_seconds         1.00       0.77 0.89  0.59  0.23   0.62       0.90
## trip_miles           0.77       1.00 0.92  0.53  0.57   0.60       0.93
## fare                 0.89       0.92 1.00  0.48  0.52   0.66       0.99
## tips                 0.59       0.53 0.48  1.00 -0.07   0.34       0.59
## tolls                0.23       0.57 0.52 -0.07  1.00   0.36       0.49
## extras               0.62       0.60 0.66  0.34  0.36   1.00       0.70
## trip_total           0.90       0.93 0.99  0.59  0.49   0.70       1.00
## 
## n= 68 
## 
## 
## P
##              trip_seconds trip_miles fare   tips   tolls  extras
## trip_seconds              0.0000     0.0000 0.0000 0.0615 0.0000
## trip_miles   0.0000                  0.0000 0.0000 0.0000 0.0000
## fare         0.0000       0.0000            0.0000 0.0000 0.0000
## tips         0.0000       0.0000     0.0000        0.5876 0.0043
## tolls        0.0615       0.0000     0.0000 0.5876        0.0029
## extras       0.0000       0.0000     0.0000 0.0043 0.0029       
## trip_total   0.0000       0.0000     0.0000 0.0000 0.0000 0.0000
##              trip_total
## trip_seconds 0.0000    
## trip_miles   0.0000    
## fare         0.0000    
## tips         0.0000    
## tolls        0.0000    
## extras       0.0000    
## trip_total

Discussions and Interpretations:

  1. The summaries and density plots above cover the continuous variables trip_seconds, trip_miles, fare, tips, extras and trip_total (taxi_id is left out). The density plots show that all of these variables are right-skewed, but they differ in the spread of their distributions. trip_seconds, for example, ranges over roughly 0 - 3000 s with most of the data between 0 and 1000 s, whereas trip_miles ranges over roughly 0 - 30 miles with most of the data between 0 and 10 miles. fare and trip_total have a similar shape and a spread of roughly 0 - 80, whereas the tips and extras curves span roughly (-2 to 12) and (-1 to 5) respectively; the negative portions of those two curves are an artifact of the density smoothing, since the observed minimum of both variables is 0.

  2. As executed above, we can see the table of the number of cases in each level of the payment_type.

  3. As executed above, we can see the correlation matrix between the continuous variables and the correlation matrix with the respective p-values. The correlation matrix leads to the following observations: 3.1 Correlation coefficients > +0.50 suggest a strong positive correlation between most pairs of the continuous variables. The strongest correlations and their significance are described below. 3.1.1 fare and trip_total have the strongest positive correlation, with a correlation coeff, R = +0.99. Their correlation is significant, as suggested by a p-value that rounds to 0.0000 (p < 0.0001). 3.1.2 trip_miles and trip_total have a strong and significant positive correlation as suggested by R = +0.93 and p < 0.0001. 3.1.3 fare and trip_miles have a strong and significant positive correlation as suggested by R = +0.92 and p < 0.0001. 3.1.4 trip_seconds and trip_total also have a strong and significant positive correlation with R = +0.90 and p < 0.0001. 3.2 trip_seconds and extras have a fairly strong positive correlation with R = +0.62, and the correlation is significant (p < 0.0001). 3.3 trip_seconds and tolls have a positive but weak correlation with R = +0.23, and this correlation is not significant as p = 0.0615 > significance level of 5%. 3.4 tolls and tips have a weak negative correlation with R = -0.07, and the correlation isn’t significant as suggested by p = 0.5876. 3.5 These observations can also be read off the circle correlation plot above, where the size and color of each circle indicate the correlation between two variables. For instance, the largest circle with the darkest color is in the cell representing fare and trip_total, indicating the strongest positive correlation. This is better illustrated by the number correlation plot, where the correlation coeff, R, is printed in each cell for the corresponding pair of variables.

# Baseline additive model: fare explained by trip distance, trip duration
# and payment type.
regout1 <- lm(
  fare ~ trip_miles + trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout1)
## 
## Call:
## lm(formula = fare ~ trip_miles + trip_seconds + payment_type, 
##     data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5143 -1.5784 -0.4741  0.8758 13.6171 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.050875   0.807649   2.539   0.0135 *  
## trip_miles               1.374831   0.122732  11.202  < 2e-16 ***
## trip_seconds             0.010218   0.001194   8.556 3.38e-12 ***
## payment_typeCredit Card -2.114804   0.937011  -2.257   0.0274 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.628 on 64 degrees of freedom
## Multiple R-squared:  0.9296, Adjusted R-squared:  0.9264 
## F-statistic: 281.9 on 3 and 64 DF,  p-value: < 2.2e-16
# Same additive model with payment_type removed.
regout2 <- lm(
  fare ~ trip_miles + trip_seconds,
  data = cleansed_some.trip
)
summary(regout2)
## 
## Call:
## lm(formula = fare ~ trip_miles + trip_seconds, data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.2407 -1.3693 -0.5315  0.5413 14.4895 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.497261   0.793362   1.887   0.0636 .  
## trip_miles   1.354763   0.126206  10.735 4.93e-16 ***
## trip_seconds 0.010071   0.001229   8.192 1.33e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.741 on 65 degrees of freedom
## Multiple R-squared:  0.924,  Adjusted R-squared:  0.9217 
## F-statistic: 395.4 on 2 and 65 DF,  p-value: < 2.2e-16
# Interaction model: adds the trip_miles x trip_seconds interaction to the
# additive model that includes payment_type.
regout3 <- lm(
  fare ~ trip_miles + trip_seconds + payment_type +
    trip_miles:trip_seconds,
  data = cleansed_some.trip
)
summary(regout3)
## 
## Call:
## lm(formula = fare ~ trip_miles + trip_seconds + payment_type + 
##     trip_miles:trip_seconds, data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3703 -1.7417 -0.2079  0.9581 12.9518 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.3625726  1.0330209   1.319   0.1919    
## trip_miles               1.6045487  0.2477787   6.476 1.62e-08 ***
## trip_seconds             0.0109424  0.0013726   7.972 4.00e-11 ***
## payment_typeCredit Card -1.9864091  0.9437066  -2.105   0.0393 *  
## trip_miles:trip_seconds -0.0001465  0.0001373  -1.067   0.2901    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.624 on 63 degrees of freedom
## Multiple R-squared:  0.9309, Adjusted R-squared:  0.9265 
## F-statistic: 212.2 on 4 and 63 DF,  p-value: < 2.2e-16
# Interaction model with payment_type removed.
regout4 <- lm(
  fare ~ trip_miles + trip_seconds + trip_miles:trip_seconds,
  data = cleansed_some.trip
)
summary(regout4)
## 
## Call:
## lm(formula = fare ~ trip_miles + trip_seconds + trip_miles:trip_seconds, 
##     data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9633 -1.5052 -0.5098  0.9055 13.5872 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              0.6778424  1.0063994   0.674    0.503    
## trip_miles               1.6438055  0.2536114   6.482 1.50e-08 ***
## trip_seconds             0.0109885  0.0014088   7.800 7.26e-11 ***
## trip_miles:trip_seconds -0.0001834  0.0001398  -1.312    0.194    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.72 on 64 degrees of freedom
## Multiple R-squared:  0.926,  Adjusted R-squared:  0.9226 
## F-statistic: 267.1 on 3 and 64 DF,  p-value: < 2.2e-16
# Second-degree polynomial in trip_miles (via poly()), keeping trip_seconds
# linear, plus the interaction and payment_type.
regout5 <- lm(
  fare ~ poly(trip_miles, 2) + trip_seconds + trip_miles:trip_seconds +
    payment_type,
  data = cleansed_some.trip
)
summary(regout5)
## 
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + trip_seconds + trip_miles:trip_seconds + 
##     payment_type, data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3930 -1.7728 -0.0816  1.3251  9.7879 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              5.6259390  0.8929302   6.301 3.43e-08 ***
## poly(trip_miles, 2)1    94.4988571  9.4789175   9.969 1.68e-14 ***
## poly(trip_miles, 2)2    23.6848393  3.6367078   6.513 1.49e-08 ***
## trip_seconds             0.0159362  0.0013133  12.135  < 2e-16 ***
## payment_typeCredit Card -0.5240151  0.7666557  -0.684    0.497    
## trip_seconds:trip_miles -0.0005704  0.0001250  -4.564 2.43e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.815 on 62 degrees of freedom
## Multiple R-squared:  0.959,  Adjusted R-squared:  0.9557 
## F-statistic: 289.8 on 5 and 62 DF,  p-value: < 2.2e-16
# Third-degree polynomial in trip_miles (via poly()), keeping trip_seconds
# linear, plus the interaction and payment_type.
regout6 <- lm(
  fare ~ poly(trip_miles, 3) + trip_seconds + trip_miles:trip_seconds +
    payment_type,
  data = cleansed_some.trip
)
summary(regout6)
## 
## Call:
## lm(formula = fare ~ poly(trip_miles, 3) + trip_seconds + trip_miles:trip_seconds + 
##     payment_type, data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8275 -1.2621  0.1084  0.8840  9.0597 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              6.148e+00  8.638e-01   7.117 1.46e-09 ***
## poly(trip_miles, 3)1     1.097e+02  1.040e+01  10.547 2.24e-15 ***
## poly(trip_miles, 3)2     2.569e+01  3.510e+00   7.320 6.51e-10 ***
## poly(trip_miles, 3)3    -9.056e+00  3.143e+00  -2.881  0.00546 ** 
## trip_seconds             1.666e-02  1.268e-03  13.146  < 2e-16 ***
## payment_typeCredit Card -5.344e-01  7.252e-01  -0.737  0.46398    
## trip_seconds:trip_miles -7.642e-04  1.360e-04  -5.619 5.05e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.663 on 61 degrees of freedom
## Multiple R-squared:  0.9639, Adjusted R-squared:  0.9603 
## F-statistic: 271.3 on 6 and 61 DF,  p-value: < 2.2e-16
# Second-degree polynomial in trip_seconds (via poly()), keeping trip_miles
# linear, plus the interaction and payment_type.
regout7 <- lm(
  fare ~ trip_miles + poly(trip_seconds, 2) + trip_miles:trip_seconds +
    payment_type,
  data = cleansed_some.trip
)
summary(regout7)
## 
## Call:
## lm(formula = fare ~ trip_miles + poly(trip_seconds, 2) + trip_miles:trip_seconds + 
##     payment_type, data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4383 -1.6578 -0.1287  1.0447 13.6949 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              9.990e+00  7.589e-01  13.163  < 2e-16 ***
## trip_miles               1.302e+00  5.059e-01   2.573   0.0125 *  
## poly(trip_seconds, 2)1   4.858e+01  8.705e+00   5.581  5.6e-07 ***
## poly(trip_seconds, 2)2  -5.512e+00  8.012e+00  -0.688   0.4941    
## payment_typeCredit Card -1.944e+00  9.497e-01  -2.047   0.0449 *  
## trip_miles:trip_seconds  3.647e-05  2.996e-04   0.122   0.9035    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.639 on 62 degrees of freedom
## Multiple R-squared:  0.9314, Adjusted R-squared:  0.9259 
## F-statistic: 168.4 on 5 and 62 DF,  p-value: < 2.2e-16
# Third-degree polynomial in trip_seconds (via poly()), keeping trip_miles
# linear, plus the interaction and payment_type.
regout8 <- lm(
  fare ~ trip_miles + poly(trip_seconds, 3) + trip_miles:trip_seconds +
    payment_type,
  data = cleansed_some.trip
)
summary(regout8)
## 
## Call:
## lm(formula = fare ~ trip_miles + poly(trip_seconds, 3) + trip_miles:trip_seconds + 
##     payment_type, data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3247 -1.2698 -0.3621  1.2323 12.0823 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.073e+01  7.149e-01  15.011  < 2e-16 ***
## trip_miles               8.378e-01  4.744e-01   1.766 0.082415 .  
## poly(trip_seconds, 3)1   4.953e+01  7.893e+00   6.275 4.01e-08 ***
## poly(trip_seconds, 3)2  -1.045e+01  7.377e+00  -1.417 0.161534    
## poly(trip_seconds, 3)3  -1.347e+01  3.539e+00  -3.805 0.000331 ***
## payment_typeCredit Card -1.995e+00  8.608e-01  -2.318 0.023840 *  
## trip_miles:trip_seconds  2.322e-04  2.764e-04   0.840 0.404201    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.298 on 61 degrees of freedom
## Multiple R-squared:  0.9446, Adjusted R-squared:  0.9391 
## F-statistic: 173.3 on 6 and 61 DF,  p-value: < 2.2e-16
# Second-degree polynomials in both trip_miles and trip_seconds, plus the
# interaction and payment_type.
regout9 <- lm(
  fare ~ poly(trip_miles, 2) + poly(trip_seconds, 2) +
    trip_miles:trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout9)
## 
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds, 
##     2) + trip_miles:trip_seconds + payment_type, data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8772 -1.3827 -0.1728  0.9072 10.9216 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.237e+01  1.604e+00  13.950  < 2e-16 ***
## poly(trip_miles, 2)1     1.531e+02  2.109e+01   7.260 8.27e-10 ***
## poly(trip_miles, 2)2     2.997e+01  3.982e+00   7.527 2.87e-10 ***
## poly(trip_seconds, 2)1   9.749e+01  9.063e+00  10.756 1.02e-15 ***
## poly(trip_seconds, 2)2   2.080e+01  6.786e+00   3.065  0.00324 ** 
## payment_typeCredit Card -2.964e-01  7.233e-01  -0.410  0.68344    
## trip_miles:trip_seconds -1.373e-03  2.871e-04  -4.784 1.13e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.642 on 61 degrees of freedom
## Multiple R-squared:  0.9644, Adjusted R-squared:  0.9609 
## F-statistic: 275.8 on 6 and 61 DF,  p-value: < 2.2e-16
# Third-degree polynomial in trip_miles combined with a second-degree
# polynomial in trip_seconds, plus the interaction and payment_type.
regout10 <- lm(
  fare ~ poly(trip_miles, 3) + poly(trip_seconds, 2) +
    trip_miles:trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout10)
## 
## Call:
## lm(formula = fare ~ poly(trip_miles, 3) + poly(trip_seconds, 
##     2) + trip_miles:trip_seconds + payment_type, data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3068 -1.0156  0.0303  0.7988 10.2290 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.467e+01  1.556e+00  15.847  < 2e-16 ***
## poly(trip_miles, 3)1     1.845e+02  2.059e+01   8.962 1.15e-12 ***
## poly(trip_miles, 3)2     3.381e+01  3.713e+00   9.106 6.55e-13 ***
## poly(trip_miles, 3)3    -1.120e+01  2.854e+00  -3.923 0.000227 ***
## poly(trip_seconds, 2)1   1.063e+02  8.460e+00  12.571  < 2e-16 ***
## poly(trip_seconds, 2)2   2.529e+01  6.210e+00   4.072 0.000139 ***
## payment_typeCredit Card -2.601e-01  6.507e-01  -0.400 0.690819    
## trip_miles:trip_seconds -1.786e-03  2.789e-04  -6.407 2.55e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.376 on 60 degrees of freedom
## Multiple R-squared:  0.9717, Adjusted R-squared:  0.9684 
## F-statistic: 294.3 on 7 and 60 DF,  p-value: < 2.2e-16
# Second-degree polynomial in trip_miles combined with a third-degree
# polynomial in trip_seconds, plus the interaction and payment_type.
regout11 <- lm(
  fare ~ poly(trip_miles, 2) + poly(trip_seconds, 3) +
    trip_miles:trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout11)
## 
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds, 
##     3) + trip_miles:trip_seconds + payment_type, data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6747 -0.9464 -0.1599  0.8439 10.8871 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.096e+01  1.610e+00  13.022  < 2e-16 ***
## poly(trip_miles, 2)1     1.306e+02  2.167e+01   6.025 1.12e-07 ***
## poly(trip_miles, 2)2     2.670e+01  3.970e+00   6.725 7.35e-09 ***
## poly(trip_seconds, 3)1   9.269e+01  8.792e+00  10.542 2.79e-15 ***
## poly(trip_seconds, 3)2   1.508e+01  6.780e+00   2.224 0.029895 *  
## poly(trip_seconds, 3)3  -7.741e+00  2.826e+00  -2.740 0.008087 ** 
## payment_typeCredit Card -5.059e-01  6.918e-01  -0.731 0.467474    
## trip_miles:trip_seconds -1.107e-03  2.897e-04  -3.820 0.000319 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.511 on 60 degrees of freedom
## Multiple R-squared:  0.9684, Adjusted R-squared:  0.9647 
## F-statistic: 262.6 on 7 and 60 DF,  p-value: < 2.2e-16
# Third-degree polynomials in both trip_miles and trip_seconds, plus the
# interaction and payment_type.
regout12 <- lm(
  fare ~ poly(trip_miles, 3) + poly(trip_seconds, 3) +
    trip_miles:trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout12)
## 
## Call:
## lm(formula = fare ~ poly(trip_miles, 3) + poly(trip_seconds, 
##     3) + trip_miles:trip_seconds + payment_type, data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0588 -0.8740 -0.0631  0.5629 10.3214 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.352e+01  1.727e+00  13.623  < 2e-16 ***
## poly(trip_miles, 3)1     1.671e+02  2.358e+01   7.087 1.93e-09 ***
## poly(trip_miles, 3)2     3.140e+01  4.027e+00   7.797 1.21e-10 ***
## poly(trip_miles, 3)3    -9.398e+00  3.081e+00  -3.051  0.00342 ** 
## poly(trip_seconds, 3)1   1.023e+02  8.821e+00  11.596  < 2e-16 ***
## poly(trip_seconds, 3)2   2.143e+01  6.687e+00   3.205  0.00218 ** 
## poly(trip_seconds, 3)3  -4.243e+00  2.886e+00  -1.470  0.14679    
## payment_typeCredit Card -3.807e-01  6.497e-01  -0.586  0.56009    
## trip_miles:trip_seconds -1.574e-03  3.117e-04  -5.049 4.56e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.354 on 59 degrees of freedom
## Multiple R-squared:  0.9727, Adjusted R-squared:  0.969 
## F-statistic: 262.8 on 8 and 59 DF,  p-value: < 2.2e-16
# The regout11 specification refitted without payment_type.
regout13 <- lm(
  fare ~ poly(trip_miles, 2) + poly(trip_seconds, 3) +
    trip_miles:trip_seconds,
  data = cleansed_some.trip
)
summary(regout13)
## 
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds, 
##     3) + trip_miles:trip_seconds, data = cleansed_some.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4509 -0.7977 -0.1280  0.7099 10.9672 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.104e+01  1.599e+00  13.158  < 2e-16 ***
## poly(trip_miles, 2)1     1.336e+02  2.119e+01   6.302 3.62e-08 ***
## poly(trip_miles, 2)2     2.762e+01  3.746e+00   7.374 5.27e-10 ***
## poly(trip_seconds, 3)1   9.406e+01  8.556e+00  10.994 4.20e-16 ***
## poly(trip_seconds, 3)2   1.573e+01  6.695e+00   2.349 0.022054 *  
## poly(trip_seconds, 3)3  -7.513e+00  2.798e+00  -2.686 0.009315 ** 
## trip_miles:trip_seconds -1.154e-03  2.814e-04  -4.101 0.000124 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.502 on 61 degrees of freedom
## Multiple R-squared:  0.9681, Adjusted R-squared:  0.965 
## F-statistic: 308.7 on 6 and 61 DF,  p-value: < 2.2e-16
# Standardized residuals of regout13 against its fitted values, with a red
# horizontal reference line at zero.
plot(
  x = regout13$fitted.values,
  y = rstandard(regout13),
  pch = 19
)
abline(h = 0, col = "red", lwd = 3)

#Identify the outliers
boxplot(cleansed_some.trip$fare)$out

## [1] 71.25 37.75 44.75 44.50 39.50 36.75 43.25 34.00 49.00
# Drop the row(s) holding the maximum fare before refitting.
# A logical keep-mask is used instead of `-which(...)`: negative indexing with
# an empty which() result (e.g. when max() is NA) silently selects zero rows.
top_fare <- max(cleansed_some.trip$fare, na.rm = TRUE)
new.cleaned_trip.dataset <- cleansed_some.trip[
  !(cleansed_some.trip$fare %in% top_fare),
]

# Same specification as regout13, fitted on the data without the top fare
regout14 <- lm(
  fare ~ poly(trip_miles, 2) + poly(trip_seconds, 3) + trip_miles:trip_seconds,
  data = new.cleaned_trip.dataset
)
summary(regout14)
## 
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds, 
##     3) + trip_miles:trip_seconds, data = new.cleaned_trip.dataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0861 -0.8209 -0.1001  0.5277 10.3973 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.112e+01  1.439e+00  14.680  < 2e-16 ***
## poly(trip_miles, 2)1     1.201e+02  1.736e+01   6.917 3.46e-09 ***
## poly(trip_miles, 2)2     2.236e+01  3.576e+00   6.252 4.65e-08 ***
## poly(trip_seconds, 3)1   9.928e+01  8.345e+00  11.896  < 2e-16 ***
## poly(trip_seconds, 3)2   2.016e+01  6.442e+00   3.129  0.00271 ** 
## poly(trip_seconds, 3)3  -5.083e+00  2.674e+00  -1.901  0.06217 .  
## trip_miles:trip_seconds -1.526e-03  2.923e-04  -5.222 2.34e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.353 on 60 degrees of freedom
## Multiple R-squared:  0.9619, Adjusted R-squared:  0.9581 
## F-statistic: 252.5 on 6 and 60 DF,  p-value: < 2.2e-16
# Standardized residuals of regout14 against its fitted values,
# with a red horizontal reference line at zero
plot(regout14$fitted.values, rstandard(regout14), pch = 19)
abline(h = 0, lwd = 3, col = "red")

Discussions and Interpretations:

  1. A simple regression model is executed above with trip_seconds, trip_miles and payment_type as the independent variables, and fare as the dependent variable. Looking at the summary of this simple regression model, regout1, we can make the below interpretations:

4.1 Multiple and Adjusted R-squared values for this model are 0.9296 and 0.9264 respectively, which suggests that we have a good fit, as around 92% of the variability of the dependent variable, fare, has been accounted for in this model. In other words, the fitted values of this simple regression model explain about 92% of the variation in the actual fare values. 4.2 The impact of each of the independent variables can be interpreted from the beta-coeff and p-values as explained below. 4.2.1 Intercept: p = 0.0135 suggests that the beta-coeff of the intercept is significant, and in a hypothetical case where trip_seconds and trip_miles are both 0 and the payment_type is the baseline category (Cash), there will be a fare of 2.050875 units. 4.2.2 trip_miles: p < 2 x 10^(-16) suggests the significance of the beta-coeff of 1.374831, and indicates that for each extra mile, holding the other variables constant, the fare will increase by 1.374831 units. 4.2.3 trip_seconds: p = 3.38 x 10^(-12) suggests the significance of the beta-coeff, and indicates that for each extra second spent in the trip, holding the other variables constant, an extra fare of 0.010218 units will be charged. 4.2.4 payment_typeCredit Card: p = 0.0274, which is below the 5% significance level (95% confidence), suggests that it has a significant beta-coeff, and indicates that if a credit card is used for payment, the fare will be lower by 2.114804 units than for the baseline payment type.

  1. Taking into consideration the interaction between trip_miles and trip_seconds, and performing transformations on these two independent variables, the regression model regout13, which includes a second-degree polynomial in trip_miles, a third-degree polynomial in trip_seconds and the interaction between these two variables, and drops payment_type, can be observed to have improved, with multiple and adjusted R-squared values of 0.9681 and 0.965 respectively. We can also observe that all the beta-coeff are significant and the median residual, -0.1280, is close to 0. But while evaluating the standardized residuals, an outlier can be observed in the fitted values vs residuals plot. By plotting a boxplot of the dependent variable, fare, from the cleansed dataset, several high fares are flagged, the most extreme being fare = 71.25. After identifying this outlier, a copy of the cleansed dataset that doesn’t contain it is stored in a new dataset, and this dataset is used to run the previously discussed regression model, identified by regout14.
# LINE assumption checks for regout14
# (Linearity, Independence, Normality, Equality of variance)

# Linearity: actual fares against fitted fares, with the y = x line in red
plot(
  new.cleaned_trip.dataset$fare, regout14$fitted.values,
  pch = 19, main = "Actual vs Fitted Value Plot: Fare"
)
abline(a = 0, b = 1, lwd = 3, col = "red")

# Normality: Q-Q plot of the standardized residuals with its reference line
qqnorm(rstandard(regout14), main = "Normality Plot of the Residuals")
qqline(rstandard(regout14))

# Equality: standardized residuals against fitted values, zero line in red
plot(
  regout14$fitted.values, rstandard(regout14),
  pch = 19, main = "Fitted Values vs Residuals Plot: Fare"
)
abline(h = 0, lwd = 3, col = "red")

Discussions and Interpretations:

  1. After executing the various combinations and getting rid of the outlier as explained above, I have selected the model regout14 as the best fit model. It has a slightly reduced adjusted R-squared value of 0.9581, but the Fitted Values vs Residuals Plot has improved, as can be seen in the above equality test. I have selected this regression model, regout14, on the basis of the improved randomness between standardized residuals and fitted values and the quality of fit suggested by an adjusted R-squared value of 0.9581; although this is slightly lower than for regout13, the standardized residuals now have a more random distribution against the fitted values. The standard R-regression output of this model can be seen above, and as explained above this model explains 95.81% of the variability in the actual fare values, has an improved median residual of -0.1001 and has significant beta-coeff for most of the variables. The model's conformity with the LINE assumptions can be interpreted as below: 6.1 Linearity: As we can see above in the Actual vs Fitted Values Plot: Fare, the scattered points are mostly along the line and there is a predominantly linear relationship between the actual and fitted fare values. 6.2 Independence: The variables are independent in this case. 6.3 Normality: As suggested by the above Normality Plot of the Residuals, the residuals are normally distributed for most of the range (-1, 1); the residuals, however, tend to deviate below the qqline for values < -1, and deviate above the line for values > 1. But overall, the residuals are approximately normally distributed. 6.4 Equality: The above Fitted Values vs Residuals Plot: Fare suggests that there seems to be no principal pattern between the standardized residuals and the fitted values, and thus the model conforms to the assumption of equal variance.
# Leverage points detection
# Hat values computed from the design matrix of regout14
lev <- hat(model.matrix(regout14))
plot(lev, pch = 19, main = "Leverage of Points Plot")
# Horizontal cutoff at three times the mean leverage
abline(h = 3 * mean(lev), col = "red", lwd = 3)

# Print the rows whose leverage exceeds the cutoff
new.cleaned_trip.dataset[lev > (3 * mean(lev)), ]

# Removing leverage points: drop the trip(s) lasting 2040 seconds.
# A logical keep-mask replaces `-which(...)`, which would return an empty
# data frame if no row matched (negative indexing with integer(0)).
keep_row <- !(new.cleaned_trip.dataset$trip_seconds %in% 2040)
reduced_trip.dataset <- new.cleaned_trip.dataset[keep_row, ]
# attach() is retained to preserve the search path the script was run with
attach(reduced_trip.dataset)
## The following objects are masked from cleansed_some.trip:
## 
##     extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
##     trip_seconds, trip_total
## The following objects are masked from some.trips:
## 
##     extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
##     trip_seconds, trip_total
# Final model: the regout14 specification fitted on reduced_trip.dataset
regout_final <- lm(
  fare ~ poly(trip_miles, 2) + poly(trip_seconds, 3) + trip_miles:trip_seconds,
  data = reduced_trip.dataset
)
summary(regout_final)
## 
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds, 
##     3) + trip_miles:trip_seconds, data = reduced_trip.dataset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2902 -0.9277 -0.0574  1.3578  7.6255 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.899e+01  3.072e+00   9.434 2.17e-13 ***
## poly(trip_miles, 2)1     1.957e+02  3.032e+01   6.454 2.25e-08 ***
## poly(trip_miles, 2)2     3.668e+01  5.891e+00   6.226 5.43e-08 ***
## poly(trip_seconds, 3)1   1.568e+02  2.219e+01   7.068 2.07e-09 ***
## poly(trip_seconds, 3)2   5.258e+01  1.250e+01   4.208 8.91e-05 ***
## poly(trip_seconds, 3)3  -7.402e+00  2.551e+00  -2.902   0.0052 ** 
## trip_miles:trip_seconds -3.203e-03  6.286e-04  -5.095 3.86e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.213 on 59 degrees of freedom
## Multiple R-squared:  0.9645, Adjusted R-squared:  0.9609 
## F-statistic: 266.9 on 6 and 59 DF,  p-value: < 2.2e-16

Discussions and Interpretations:

  1. As suggested by the Leverage of Points Plot, there are 4 high leverage points influencing the regression plot. Out of the 4 leverage points, the one with trip_seconds = 2040 seems to be inappropriate when compared to the other leverage points, as the taxi covers only 0.90 miles in 2040 seconds. As executed above, the regression model has been refitted on a new reduced dataset without this leverage point, and we can observe that the fit of the final regression model, regout_final, has improved, as suggested by the improved multiple and adjusted R-squared values of 0.9645 and 0.9609 respectively. The final model can account for 96.09% of the variability of the dependent variable, fare, i.e. the fitted values explain 96.09% of the variation in the actual fare values. In addition, the model now has a significant beta-coeff for all the independent variables, and a good median residual of -0.0574.
# Draw a second random sample of 100 trips (without replacement) from the
# full data, using a new seed
set.seed(71088971)
sample.trips <- trips[
  sample(seq_len(nrow(trips)), size = 100, replace = FALSE),
]
attach(sample.trips)
## The following objects are masked from reduced_trip.dataset:
## 
##     extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
##     trip_seconds, trip_total
## The following objects are masked from cleansed_some.trip:
## 
##     extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
##     trip_seconds, trip_total
## The following objects are masked from some.trips:
## 
##     extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
##     trip_seconds, trip_total
# Remove aberrant cases: keep only trips whose duration and distance are
# both non-zero
cleansed_sample.trip <- subset(
  sample.trips,
  trip_seconds != 0 & trip_miles != 0
)
attach(cleansed_sample.trip)
## The following objects are masked from sample.trips:
## 
##     extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
##     trip_seconds, trip_total
## The following objects are masked from reduced_trip.dataset:
## 
##     extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
##     trip_seconds, trip_total
## The following objects are masked from cleansed_some.trip:
## 
##     extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
##     trip_seconds, trip_total
## The following objects are masked from some.trips:
## 
##     extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
##     trip_seconds, trip_total
# Fit the final model specification on the new sample dataset.
# NOTE(review): this re-estimates the coefficients on cleansed_sample.trip and
# overwrites the earlier regout_final; it does not score the previously fitted
# model on new data (that would need predict() with newdata) - confirm intent.
regout_final <- lm(
  fare ~ poly(trip_miles, 2) + poly(trip_seconds, 3) + trip_miles:trip_seconds,
  data = cleansed_sample.trip
)
summary(regout_final)
## 
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds, 
##     3) + trip_miles:trip_seconds, data = cleansed_sample.trip)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2266 -1.4475 -0.3000  0.5687 13.3519 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.752e+01  7.096e-01  24.690  < 2e-16 ***
## poly(trip_miles, 2)1     1.052e+02  1.227e+01   8.576 2.77e-12 ***
## poly(trip_miles, 2)2     2.848e+01  3.903e+00   7.298 5.15e-10 ***
## poly(trip_seconds, 3)1   8.010e+01  4.554e+00  17.588  < 2e-16 ***
## poly(trip_seconds, 3)2   1.419e-01  3.595e+00   0.039    0.969    
## poly(trip_seconds, 3)3  -4.490e+00  3.417e+00  -1.314    0.193    
## trip_miles:trip_seconds -8.425e-04  1.386e-04  -6.080 7.06e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.692 on 65 degrees of freedom
## Multiple R-squared:  0.9524, Adjusted R-squared:  0.948 
## F-statistic: 216.7 on 6 and 65 DF,  p-value: < 2.2e-16

Discussions and Interpretations

  1. Refitting the final regression model specification, regout_final, on the new sample dataset, cleansed_sample.trip, we can observe that although the adjusted R-squared value has reduced, the model still has a good fit, with an adjusted R-squared value of 0.948. With an adjusted R-squared value of approx 95%, the model still performs well with a new dataset and succeeds in explaining about 95% of the variability in the dependent variable, fare.