# Load the raw trip data and draw the 100-row working sample.
# The workspace is intentionally not wiped with rm(list = ls()): a script
# should not delete objects it did not create.
library(rio)
trips <- import("6304 Regression Project Data.csv")
# Make the column names syntactic and lower-case.
colnames(trips) <- tolower(make.names(colnames(trips)))
# Fixed seed so the same rows are drawn on every run.
set.seed(71088966)
some.trips <- trips[sample(seq_len(nrow(trips)), 100, replace = FALSE), ]
# NOTE(review): attach() is kept only because later statements refer to the
# columns by bare name; prefer some.trips$col or with(some.trips, ...).
attach(some.trips)
# View the structure of the dataset
str(some.trips)
## 'data.frame': 100 obs. of 9 variables:
## $ taxi_id : int 197 1615 6969 2271 5414 6785 7131 1490 1015 7746 ...
## $ trip_seconds: int 180 240 600 780 180 120 420 420 120 1080 ...
## $ trip_miles : num 0 0 0.2 2.6 0 0.4 1.4 0 0 0.6 ...
## $ fare : num 5 5.25 10.45 9.75 4.75 ...
## $ tips : num 0 0 0 0 1 2 0 0 0 0 ...
## $ tolls : num 0 0 0 0 0 0 0 0 0 0 ...
## $ extras : num 0 1 0 1 0 0 0 1.5 0 0 ...
## $ trip_total : num 5 6.25 10.45 10.75 5.75 ...
## $ payment_type: chr "Cash" "Credit Card" "Cash" "Cash" ...
# Scatter plots of fare against trip duration and trip distance, each with a
# red reference line through the origin (slopes 0.01 and 2), used to spot
# aberrant cases.
with(some.trips, plot(trip_seconds, fare))
abline(a = 0, b = 0.01, col = "red", pch = 19)
with(some.trips, plot(trip_miles, fare))
abline(a = 0, b = 2, col = "red", pch = 19)
# Removing aberrant cases: keep trips with non-zero duration and distance
cleansed_some.trip <- some.trips[
  which(some.trips$trip_seconds != 0 & some.trips$trip_miles != 0),
]
attach(cleansed_some.trip)
## The following objects are masked from some.trips:
##
## extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
## trip_seconds, trip_total
Discussions and Interpretations:
# Five-number summaries (plus mean) and kernel density plots of the
# continuous variables in the cleansed sample.
# Trip Seconds
summary(cleansed_some.trip$trip_seconds)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 120.0 360.0 600.0 774.7 900.0 3240.0
plot(density(cleansed_some.trip$trip_seconds), main = "Trip Seconds Plot", lwd = 3, pch = 19)
# Trip Miles
summary(cleansed_some.trip$trip_miles)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.100 0.675 1.600 3.916 4.925 30.300
plot(density(cleansed_some.trip$trip_miles), main = "Trip Miles Plot", lwd = 3, pch = 19)
# Trip Fare
summary(cleansed_some.trip$fare)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.250 6.688 9.250 14.604 16.375 71.250
plot(density(cleansed_some.trip$fare), main = "Trip Fare Plot", lwd = 3, pch = 19)
# Trip Tips
summary(cleansed_some.trip$tips)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.393 2.000 10.600
plot(density(cleansed_some.trip$tips), main = "Trip Tip Plot", lwd = 3, pch = 19)
# Trip Extras
summary(cleansed_some.trip$extras)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.6103 1.0000 4.5000
plot(density(cleansed_some.trip$extras), main = "Trip Extra Plot", lwd = 3, pch = 19)
# Trip Total
summary(cleansed_some.trip$trip_total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.450 7.438 10.250 16.673 19.360 79.750
plot(density(cleansed_some.trip$trip_total), main = "Trip Total Plot", lwd = 3, pch = 19)
# Determining the number of cases in each payment type
table(cleansed_some.trip$payment_type)
##
## Cash Credit Card
## 44 24
# Listing the individual cases of each payment type
split(cleansed_some.trip, cleansed_some.trip$payment_type)
## $Cash
## taxi_id trip_seconds trip_miles fare tips tolls extras trip_total
## 1323639 6969 600 0.2 10.45 0 0.0 0.0 10.45
## 715370 2271 780 2.6 9.75 0 0.0 1.0 10.75
## 1564888 7131 420 1.4 6.75 0 0.0 0.0 6.75
## 104870 7746 1080 0.6 27.00 0 0.0 0.0 27.00
## 287685 3671 1140 7.6 22.00 0 0.0 0.0 22.00
## 130446 2059 1080 3.6 13.25 0 0.0 0.0 13.25
## 422577 2918 720 0.2 13.50 0 0.0 0.0 13.50
## 404357 904 1860 30.3 71.25 0 4.5 4.0 79.75
## 1422473 3013 600 2.8 10.50 0 0.0 0.0 10.50
## 1561979 2750 600 0.1 10.00 0 0.0 0.0 10.00
## 1374344 4770 660 4.9 15.00 0 0.0 0.0 15.00
## 793977 1025 240 0.6 4.75 0 0.0 0.0 4.75
## 147579 5545 300 1.0 6.25 0 0.0 0.0 6.25
## 1028935 7271 840 2.0 10.00 0 0.0 0.0 10.00
## 698115 8067 240 0.8 5.25 0 0.0 0.0 5.25
## 1108944 97 2040 0.9 37.75 0 0.0 2.0 39.75
## 665641 2618 780 5.0 15.25 0 0.0 1.0 16.25
## 220233 8422 180 0.5 4.50 0 0.0 0.0 4.50
## 1445043 7065 540 3.5 11.75 0 0.0 0.0 11.75
## 1581512 5699 1440 18.0 44.50 0 0.0 1.0 45.50
## 1478674 1561 360 1.1 6.25 0 0.0 2.0 8.25
## 666919 5174 660 0.1 8.50 0 0.0 0.0 8.50
## 303642 6425 300 1.2 6.25 0 0.0 1.0 7.25
## 731782 3601 660 0.1 11.00 0 0.0 0.0 11.00
## 1644228 7478 1740 7.2 36.75 0 0.0 4.5 41.25
## 784234 5900 360 0.7 5.75 0 0.0 0.0 5.75
## 8524 4126 360 0.9 6.00 0 0.0 0.0 6.00
## 1185092 1282 120 0.7 4.45 0 0.0 2.0 6.45
## 862482 1654 300 1.1 6.00 0 0.0 1.0 7.00
## 1542242 7065 780 2.2 8.85 0 0.0 1.0 9.85
## 1236687 890 360 0.6 4.45 0 0.0 0.0 4.45
## 1100139 4605 600 3.0 10.75 0 0.0 0.0 10.75
## 932180 3760 480 1.5 7.50 0 0.0 0.0 7.50
## 1661558 5430 780 7.4 20.75 0 0.0 3.0 23.75
## 817955 4010 120 0.5 4.50 0 0.0 0.0 4.50
## 145196 2640 900 0.1 9.50 0 0.0 0.0 9.50
## 140349 2655 720 0.5 7.25 0 0.0 0.0 7.25
## 444334 2214 540 2.5 9.75 0 0.0 1.0 10.75
## 496366 7578 600 1.7 8.50 0 0.0 0.0 8.50
## 759098 4041 600 2.1 8.75 0 0.0 0.0 8.75
## 1428452 900 300 1.0 5.85 0 0.0 1.0 6.85
## 700432 2953 360 0.1 7.25 0 0.0 1.0 8.25
## 83654 4209 1440 6.1 19.25 0 0.0 0.0 19.25
## 504561 2062 1260 10.5 23.85 0 0.0 1.0 24.85
## payment_type
## 1323639 Cash
## 715370 Cash
## 1564888 Cash
## 104870 Cash
## 287685 Cash
## 130446 Cash
## 422577 Cash
## 404357 Cash
## 1422473 Cash
## 1561979 Cash
## 1374344 Cash
## 793977 Cash
## 147579 Cash
## 1028935 Cash
## 698115 Cash
## 1108944 Cash
## 665641 Cash
## 220233 Cash
## 1445043 Cash
## 1581512 Cash
## 1478674 Cash
## 666919 Cash
## 303642 Cash
## 731782 Cash
## 1644228 Cash
## 784234 Cash
## 8524 Cash
## 1185092 Cash
## 862482 Cash
## 1542242 Cash
## 1236687 Cash
## 1100139 Cash
## 932180 Cash
## 1661558 Cash
## 817955 Cash
## 145196 Cash
## 140349 Cash
## 444334 Cash
## 496366 Cash
## 759098 Cash
## 1428452 Cash
## 700432 Cash
## 83654 Cash
## 504561 Cash
##
## $`Credit Card`
## taxi_id trip_seconds trip_miles fare tips tolls extras
## 138578 6785 120 0.40 4.25 2.00 0 0
## 1587751 2090 900 6.00 18.25 3.65 0 0
## 853491 1107 480 1.40 7.25 1.00 0 0
## 1049599 6064 480 0.12 7.00 4.00 0 0
## 1423020 8687 180 0.90 5.25 1.75 0 0
## 583635 7548 780 2.30 10.25 2.00 0 0
## 1262545 2893 480 1.10 7.25 3.00 0 0
## 1385697 59 1860 17.80 44.75 8.95 0 0
## 1277064 1567 540 2.04 8.25 2.00 0 0
## 644690 579 480 1.80 8.00 2.00 0 0
## 1621811 6993 540 2.40 7.65 2.00 0 0
## 1649766 2048 1740 15.60 39.50 10.00 0 4
## 960012 7264 1200 6.20 18.50 4.87 0 1
## 217295 5884 600 2.80 10.75 2.00 0 0
## 554235 4694 1860 17.40 43.25 8.85 0 1
## 851982 6301 2100 12.40 34.00 7.60 0 4
## 882695 1539 360 0.10 7.25 3.00 0 0
## 144383 2987 1380 8.00 22.50 3.50 0 0
## 1403049 2510 180 0.60 4.75 2.00 0 0
## 159715 6961 720 1.90 9.00 2.00 0 0
## 731034 6824 420 1.20 6.50 2.00 0 0
## 917619 1763 420 1.40 7.00 2.00 0 0
## 139529 5338 3240 17.80 49.00 10.60 0 4
## 1133092 4261 780 5.10 15.75 3.94 0 0
## trip_total payment_type
## 138578 6.25 Credit Card
## 1587751 21.90 Credit Card
## 853491 8.25 Credit Card
## 1049599 11.00 Credit Card
## 1423020 7.00 Credit Card
## 583635 12.25 Credit Card
## 1262545 10.25 Credit Card
## 1385697 53.70 Credit Card
## 1277064 10.25 Credit Card
## 644690 10.00 Credit Card
## 1621811 9.65 Credit Card
## 1649766 53.50 Credit Card
## 960012 24.37 Credit Card
## 217295 12.75 Credit Card
## 554235 53.10 Credit Card
## 851982 45.60 Credit Card
## 882695 10.25 Credit Card
## 144383 26.00 Credit Card
## 1403049 6.75 Credit Card
## 159715 11.00 Credit Card
## 731034 8.50 Credit Card
## 917619 9.00 Credit Card
## 139529 63.60 Credit Card
## 1133092 19.69 Credit Card
# Correlation analysis: keep only the continuous columns of the cleansed sample
continuous_var_some.trip <- cleansed_some.trip[
  c("trip_seconds", "trip_miles", "fare", "tips", "tolls", "extras", "trip_total")
]
# Pairwise correlation matrix of those columns
library(corrplot)
## corrplot 0.84 loaded
xx <- cor(continuous_var_some.trip)
xx
## trip_seconds trip_miles fare tips tolls
## trip_seconds 1.0000000 0.7730028 0.8884842 0.58984374 0.22797164
## trip_miles 0.7730028 1.0000000 0.9195883 0.53326244 0.56894630
## fare 0.8884842 0.9195883 1.0000000 0.48437961 0.52150930
## tips 0.5898437 0.5332624 0.4843796 1.00000000 -0.06692989
## tolls 0.2279716 0.5689463 0.5215093 -0.06692989 1.00000000
## extras 0.6245751 0.6021173 0.6600784 0.34188625 0.35617831
## trip_total 0.8996058 0.9268045 0.9894412 0.59369275 0.49029068
## extras trip_total
## trip_seconds 0.6245751 0.8996058
## trip_miles 0.6021173 0.9268045
## fare 0.6600784 0.9894412
## tips 0.3418862 0.5936928
## tolls 0.3561783 0.4902907
## extras 1.0000000 0.6988474
## trip_total 0.6988474 1.0000000
# Lower-triangle correlation plots: circles first, then the coefficients
corrplot(xx, type = "lower", method = "circle")
corrplot(xx, type = "lower", method = "number")
# Correlation matrix with p values (rcorr is provided by Hmisc)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Overwrite xx with the rcorr result (coefficients, n and p-values) and print it
xx <- rcorr(as.matrix(continuous_var_some.trip))
xx
## trip_seconds trip_miles fare tips tolls extras trip_total
## trip_seconds 1.00 0.77 0.89 0.59 0.23 0.62 0.90
## trip_miles 0.77 1.00 0.92 0.53 0.57 0.60 0.93
## fare 0.89 0.92 1.00 0.48 0.52 0.66 0.99
## tips 0.59 0.53 0.48 1.00 -0.07 0.34 0.59
## tolls 0.23 0.57 0.52 -0.07 1.00 0.36 0.49
## extras 0.62 0.60 0.66 0.34 0.36 1.00 0.70
## trip_total 0.90 0.93 0.99 0.59 0.49 0.70 1.00
##
## n= 68
##
##
## P
## trip_seconds trip_miles fare tips tolls extras
## trip_seconds 0.0000 0.0000 0.0000 0.0615 0.0000
## trip_miles 0.0000 0.0000 0.0000 0.0000 0.0000
## fare 0.0000 0.0000 0.0000 0.0000 0.0000
## tips 0.0000 0.0000 0.0000 0.5876 0.0043
## tolls 0.0615 0.0000 0.0000 0.5876 0.0029
## extras 0.0000 0.0000 0.0000 0.0043 0.0029
## trip_total 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## trip_total
## trip_seconds 0.0000
## trip_miles 0.0000
## fare 0.0000
## tips 0.0000
## tolls 0.0000
## extras 0.0000
## trip_total
Discussions and Interpretations:
You can see above the summaries and density plots of the continuous variables (taxi_id, which is an identifier, and tolls are not shown). As the density plots show, all of these variables are right skewed, but they differ in the spread of their distributions. trip_seconds, for example, is distributed over 0 - 3000s with most of the data around 0 - 1000s, whereas trip_miles is distributed over 0 - 30 miles with most of the data around 0 - 10 miles. fare and trip_total have a similar shape and spread, ranging over 0 - 80, differing from tips and extras, whose density curves spread over (-2 - 12) and (-1 - 5) respectively; the negative lower ends are an artifact of the density smoothing, since the observed minimum of both variables is 0.
As executed above, we can see the cases in each level of payment_type: 44 trips were paid in Cash and 24 by Credit Card.
As executed above, we can see the correlation matrix between the continuous variables and the correlation matrix with the respective p-values. These matrices lead to the observations below: 3.1 Correlation coefficients > +0.50 suggest a strong positive correlation between most pairs of continuous variables. The strongest correlations and their significance are described below. 3.1.1 fare and trip_total have the strongest positive correlation, with a correlation coefficient R = +0.99. Their correlation is significant, as the p-value is reported as 0 (i.e., p < 0.0001). 3.1.2 trip_miles and trip_total have a strong and significant positive correlation, as suggested by R = +0.93 and p = 0. 3.1.3 trip_miles and fare have a strong and significant positive correlation, as suggested by R = +0.92 and p = 0. 3.1.4 trip_seconds and trip_total also have a strong and significant positive correlation, with R = +0.90 and p = 0. 3.2 trip_seconds and extras have a strong positive correlation with R = +0.62, and the correlation is significant (p = 0). 3.3 trip_seconds and tolls have a positive but weak correlation with R = +0.23, and the correlation is not significant, as p = 0.0615 > the 5% significance level. 3.4 tolls and tips have a weak negative correlation with R = -0.07, and the correlation isn't significant, as suggested by p = 0.5876. 3.5 These observations can also be read from the circle correlation plot above, where the size and color of each circle indicate the correlation between two variables. For instance, the largest circle with the darkest color is in the cell representing fare and trip_total, indicating the strongest positive correlation. This is better illustrated by the number correlation plot, where the correlation coefficient R is shown in each cell for the corresponding pair of variables.
# Baseline additive model: fare on distance, duration and payment type
regout1 <- lm(
  formula = fare ~ trip_miles + trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout1)
##
## Call:
## lm(formula = fare ~ trip_miles + trip_seconds + payment_type,
## data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5143 -1.5784 -0.4741 0.8758 13.6171
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.050875 0.807649 2.539 0.0135 *
## trip_miles 1.374831 0.122732 11.202 < 2e-16 ***
## trip_seconds 0.010218 0.001194 8.556 3.38e-12 ***
## payment_typeCredit Card -2.114804 0.937011 -2.257 0.0274 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.628 on 64 degrees of freedom
## Multiple R-squared: 0.9296, Adjusted R-squared: 0.9264
## F-statistic: 281.9 on 3 and 64 DF, p-value: < 2.2e-16
# Same additive model with payment_type dropped
regout2 <- lm(
  formula = fare ~ trip_miles + trip_seconds,
  data = cleansed_some.trip
)
summary(regout2)
##
## Call:
## lm(formula = fare ~ trip_miles + trip_seconds, data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.2407 -1.3693 -0.5315 0.5413 14.4895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.497261 0.793362 1.887 0.0636 .
## trip_miles 1.354763 0.126206 10.735 4.93e-16 ***
## trip_seconds 0.010071 0.001229 8.192 1.33e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.741 on 65 degrees of freedom
## Multiple R-squared: 0.924, Adjusted R-squared: 0.9217
## F-statistic: 395.4 on 2 and 65 DF, p-value: < 2.2e-16
# Additive model plus the trip_miles x trip_seconds interaction,
# with payment_type kept
regout3 <- lm(
  formula = fare ~ trip_miles + trip_seconds + payment_type +
    trip_miles:trip_seconds,
  data = cleansed_some.trip
)
summary(regout3)
##
## Call:
## lm(formula = fare ~ trip_miles + trip_seconds + payment_type +
## trip_miles:trip_seconds, data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3703 -1.7417 -0.2079 0.9581 12.9518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3625726 1.0330209 1.319 0.1919
## trip_miles 1.6045487 0.2477787 6.476 1.62e-08 ***
## trip_seconds 0.0109424 0.0013726 7.972 4.00e-11 ***
## payment_typeCredit Card -1.9864091 0.9437066 -2.105 0.0393 *
## trip_miles:trip_seconds -0.0001465 0.0001373 -1.067 0.2901
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.624 on 63 degrees of freedom
## Multiple R-squared: 0.9309, Adjusted R-squared: 0.9265
## F-statistic: 212.2 on 4 and 63 DF, p-value: < 2.2e-16
# Interaction model with payment_type dropped
regout4 <- lm(
  formula = fare ~ trip_miles + trip_seconds + trip_miles:trip_seconds,
  data = cleansed_some.trip
)
summary(regout4)
##
## Call:
## lm(formula = fare ~ trip_miles + trip_seconds + trip_miles:trip_seconds,
## data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9633 -1.5052 -0.5098 0.9055 13.5872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6778424 1.0063994 0.674 0.503
## trip_miles 1.6438055 0.2536114 6.482 1.50e-08 ***
## trip_seconds 0.0109885 0.0014088 7.800 7.26e-11 ***
## trip_miles:trip_seconds -0.0001834 0.0001398 -1.312 0.194
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.72 on 64 degrees of freedom
## Multiple R-squared: 0.926, Adjusted R-squared: 0.9226
## F-statistic: 267.1 on 3 and 64 DF, p-value: < 2.2e-16
# Second-degree polynomial in trip_miles; trip_seconds stays linear
regout5 <- lm(
  formula = fare ~ poly(trip_miles, 2) + trip_seconds +
    trip_miles:trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout5)
##
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + trip_seconds + trip_miles:trip_seconds +
## payment_type, data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3930 -1.7728 -0.0816 1.3251 9.7879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.6259390 0.8929302 6.301 3.43e-08 ***
## poly(trip_miles, 2)1 94.4988571 9.4789175 9.969 1.68e-14 ***
## poly(trip_miles, 2)2 23.6848393 3.6367078 6.513 1.49e-08 ***
## trip_seconds 0.0159362 0.0013133 12.135 < 2e-16 ***
## payment_typeCredit Card -0.5240151 0.7666557 -0.684 0.497
## trip_seconds:trip_miles -0.0005704 0.0001250 -4.564 2.43e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.815 on 62 degrees of freedom
## Multiple R-squared: 0.959, Adjusted R-squared: 0.9557
## F-statistic: 289.8 on 5 and 62 DF, p-value: < 2.2e-16
# Third-degree polynomial in trip_miles; trip_seconds stays linear
regout6 <- lm(
  formula = fare ~ poly(trip_miles, 3) + trip_seconds +
    trip_miles:trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout6)
##
## Call:
## lm(formula = fare ~ poly(trip_miles, 3) + trip_seconds + trip_miles:trip_seconds +
## payment_type, data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.8275 -1.2621 0.1084 0.8840 9.0597
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.148e+00 8.638e-01 7.117 1.46e-09 ***
## poly(trip_miles, 3)1 1.097e+02 1.040e+01 10.547 2.24e-15 ***
## poly(trip_miles, 3)2 2.569e+01 3.510e+00 7.320 6.51e-10 ***
## poly(trip_miles, 3)3 -9.056e+00 3.143e+00 -2.881 0.00546 **
## trip_seconds 1.666e-02 1.268e-03 13.146 < 2e-16 ***
## payment_typeCredit Card -5.344e-01 7.252e-01 -0.737 0.46398
## trip_seconds:trip_miles -7.642e-04 1.360e-04 -5.619 5.05e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.663 on 61 degrees of freedom
## Multiple R-squared: 0.9639, Adjusted R-squared: 0.9603
## F-statistic: 271.3 on 6 and 61 DF, p-value: < 2.2e-16
# Second-degree polynomial in trip_seconds; trip_miles stays linear
regout7 <- lm(
  formula = fare ~ trip_miles + poly(trip_seconds, 2) +
    trip_miles:trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout7)
##
## Call:
## lm(formula = fare ~ trip_miles + poly(trip_seconds, 2) + trip_miles:trip_seconds +
## payment_type, data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.4383 -1.6578 -0.1287 1.0447 13.6949
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.990e+00 7.589e-01 13.163 < 2e-16 ***
## trip_miles 1.302e+00 5.059e-01 2.573 0.0125 *
## poly(trip_seconds, 2)1 4.858e+01 8.705e+00 5.581 5.6e-07 ***
## poly(trip_seconds, 2)2 -5.512e+00 8.012e+00 -0.688 0.4941
## payment_typeCredit Card -1.944e+00 9.497e-01 -2.047 0.0449 *
## trip_miles:trip_seconds 3.647e-05 2.996e-04 0.122 0.9035
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.639 on 62 degrees of freedom
## Multiple R-squared: 0.9314, Adjusted R-squared: 0.9259
## F-statistic: 168.4 on 5 and 62 DF, p-value: < 2.2e-16
# Third-degree polynomial in trip_seconds; trip_miles stays linear
regout8 <- lm(
  formula = fare ~ trip_miles + poly(trip_seconds, 3) +
    trip_miles:trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout8)
##
## Call:
## lm(formula = fare ~ trip_miles + poly(trip_seconds, 3) + trip_miles:trip_seconds +
## payment_type, data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3247 -1.2698 -0.3621 1.2323 12.0823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.073e+01 7.149e-01 15.011 < 2e-16 ***
## trip_miles 8.378e-01 4.744e-01 1.766 0.082415 .
## poly(trip_seconds, 3)1 4.953e+01 7.893e+00 6.275 4.01e-08 ***
## poly(trip_seconds, 3)2 -1.045e+01 7.377e+00 -1.417 0.161534
## poly(trip_seconds, 3)3 -1.347e+01 3.539e+00 -3.805 0.000331 ***
## payment_typeCredit Card -1.995e+00 8.608e-01 -2.318 0.023840 *
## trip_miles:trip_seconds 2.322e-04 2.764e-04 0.840 0.404201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.298 on 61 degrees of freedom
## Multiple R-squared: 0.9446, Adjusted R-squared: 0.9391
## F-statistic: 173.3 on 6 and 61 DF, p-value: < 2.2e-16
# Second-degree polynomials in both trip_miles and trip_seconds
regout9 <- lm(
  formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds, 2) +
    trip_miles:trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout9)
##
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds,
## 2) + trip_miles:trip_seconds + payment_type, data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8772 -1.3827 -0.1728 0.9072 10.9216
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.237e+01 1.604e+00 13.950 < 2e-16 ***
## poly(trip_miles, 2)1 1.531e+02 2.109e+01 7.260 8.27e-10 ***
## poly(trip_miles, 2)2 2.997e+01 3.982e+00 7.527 2.87e-10 ***
## poly(trip_seconds, 2)1 9.749e+01 9.063e+00 10.756 1.02e-15 ***
## poly(trip_seconds, 2)2 2.080e+01 6.786e+00 3.065 0.00324 **
## payment_typeCredit Card -2.964e-01 7.233e-01 -0.410 0.68344
## trip_miles:trip_seconds -1.373e-03 2.871e-04 -4.784 1.13e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.642 on 61 degrees of freedom
## Multiple R-squared: 0.9644, Adjusted R-squared: 0.9609
## F-statistic: 275.8 on 6 and 61 DF, p-value: < 2.2e-16
# Third-degree polynomial in trip_miles, second-degree in trip_seconds
regout10 <- lm(
  formula = fare ~ poly(trip_miles, 3) + poly(trip_seconds, 2) +
    trip_miles:trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout10)
##
## Call:
## lm(formula = fare ~ poly(trip_miles, 3) + poly(trip_seconds,
## 2) + trip_miles:trip_seconds + payment_type, data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3068 -1.0156 0.0303 0.7988 10.2290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.467e+01 1.556e+00 15.847 < 2e-16 ***
## poly(trip_miles, 3)1 1.845e+02 2.059e+01 8.962 1.15e-12 ***
## poly(trip_miles, 3)2 3.381e+01 3.713e+00 9.106 6.55e-13 ***
## poly(trip_miles, 3)3 -1.120e+01 2.854e+00 -3.923 0.000227 ***
## poly(trip_seconds, 2)1 1.063e+02 8.460e+00 12.571 < 2e-16 ***
## poly(trip_seconds, 2)2 2.529e+01 6.210e+00 4.072 0.000139 ***
## payment_typeCredit Card -2.601e-01 6.507e-01 -0.400 0.690819
## trip_miles:trip_seconds -1.786e-03 2.789e-04 -6.407 2.55e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.376 on 60 degrees of freedom
## Multiple R-squared: 0.9717, Adjusted R-squared: 0.9684
## F-statistic: 294.3 on 7 and 60 DF, p-value: < 2.2e-16
# Second-degree polynomial in trip_miles, third-degree in trip_seconds
regout11 <- lm(
  formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds, 3) +
    trip_miles:trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout11)
##
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds,
## 3) + trip_miles:trip_seconds + payment_type, data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6747 -0.9464 -0.1599 0.8439 10.8871
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.096e+01 1.610e+00 13.022 < 2e-16 ***
## poly(trip_miles, 2)1 1.306e+02 2.167e+01 6.025 1.12e-07 ***
## poly(trip_miles, 2)2 2.670e+01 3.970e+00 6.725 7.35e-09 ***
## poly(trip_seconds, 3)1 9.269e+01 8.792e+00 10.542 2.79e-15 ***
## poly(trip_seconds, 3)2 1.508e+01 6.780e+00 2.224 0.029895 *
## poly(trip_seconds, 3)3 -7.741e+00 2.826e+00 -2.740 0.008087 **
## payment_typeCredit Card -5.059e-01 6.918e-01 -0.731 0.467474
## trip_miles:trip_seconds -1.107e-03 2.897e-04 -3.820 0.000319 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.511 on 60 degrees of freedom
## Multiple R-squared: 0.9684, Adjusted R-squared: 0.9647
## F-statistic: 262.6 on 7 and 60 DF, p-value: < 2.2e-16
# Third-degree polynomials in both trip_miles and trip_seconds
regout12 <- lm(
  formula = fare ~ poly(trip_miles, 3) + poly(trip_seconds, 3) +
    trip_miles:trip_seconds + payment_type,
  data = cleansed_some.trip
)
summary(regout12)
##
## Call:
## lm(formula = fare ~ poly(trip_miles, 3) + poly(trip_seconds,
## 3) + trip_miles:trip_seconds + payment_type, data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0588 -0.8740 -0.0631 0.5629 10.3214
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.352e+01 1.727e+00 13.623 < 2e-16 ***
## poly(trip_miles, 3)1 1.671e+02 2.358e+01 7.087 1.93e-09 ***
## poly(trip_miles, 3)2 3.140e+01 4.027e+00 7.797 1.21e-10 ***
## poly(trip_miles, 3)3 -9.398e+00 3.081e+00 -3.051 0.00342 **
## poly(trip_seconds, 3)1 1.023e+02 8.821e+00 11.596 < 2e-16 ***
## poly(trip_seconds, 3)2 2.143e+01 6.687e+00 3.205 0.00218 **
## poly(trip_seconds, 3)3 -4.243e+00 2.886e+00 -1.470 0.14679
## payment_typeCredit Card -3.807e-01 6.497e-01 -0.586 0.56009
## trip_miles:trip_seconds -1.574e-03 3.117e-04 -5.049 4.56e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.354 on 59 degrees of freedom
## Multiple R-squared: 0.9727, Adjusted R-squared: 0.969
## F-statistic: 262.8 on 8 and 59 DF, p-value: < 2.2e-16
# regout11 specification with payment_type dropped
regout13 <- lm(
  formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds, 3) +
    trip_miles:trip_seconds,
  data = cleansed_some.trip
)
summary(regout13)
##
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds,
## 3) + trip_miles:trip_seconds, data = cleansed_some.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4509 -0.7977 -0.1280 0.7099 10.9672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.104e+01 1.599e+00 13.158 < 2e-16 ***
## poly(trip_miles, 2)1 1.336e+02 2.119e+01 6.302 3.62e-08 ***
## poly(trip_miles, 2)2 2.762e+01 3.746e+00 7.374 5.27e-10 ***
## poly(trip_seconds, 3)1 9.406e+01 8.556e+00 10.994 4.20e-16 ***
## poly(trip_seconds, 3)2 1.573e+01 6.695e+00 2.349 0.022054 *
## poly(trip_seconds, 3)3 -7.513e+00 2.798e+00 -2.686 0.009315 **
## trip_miles:trip_seconds -1.154e-03 2.814e-04 -4.101 0.000124 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.502 on 61 degrees of freedom
## Multiple R-squared: 0.9681, Adjusted R-squared: 0.965
## F-statistic: 308.7 on 6 and 61 DF, p-value: < 2.2e-16
# Standardized residuals of regout13 against its fitted values,
# with a horizontal reference line at zero
plot(x = regout13$fitted.values, y = rstandard(regout13), pch = 19)
abline(h = 0, col = "red", lwd = 3)
# Identify the outliers: draw the boxplot of fare and print the flagged values
boxplot(cleansed_some.trip$fare)$out
## [1] 71.25 37.75 44.75 44.50 39.50 36.75 43.25 34.00 49.00
# Drop the row(s) holding the largest fare and refit the regout13
# specification without them.
# A logical mask replaces -which(...): negative indexing with an empty match
# set (e.g. when fare contains NA, so max() is NA) silently returns zero rows.
fare_values <- cleansed_some.trip$fare
is_top_fare <- !is.na(fare_values) &
  fare_values == max(fare_values, na.rm = TRUE)
new.cleaned_trip.dataset <- cleansed_some.trip[!is_top_fare, ]
regout14 <- lm(
  formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds, 3) +
    trip_miles:trip_seconds,
  data = new.cleaned_trip.dataset
)
summary(regout14)
##
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds,
## 3) + trip_miles:trip_seconds, data = new.cleaned_trip.dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0861 -0.8209 -0.1001 0.5277 10.3973
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.112e+01 1.439e+00 14.680 < 2e-16 ***
## poly(trip_miles, 2)1 1.201e+02 1.736e+01 6.917 3.46e-09 ***
## poly(trip_miles, 2)2 2.236e+01 3.576e+00 6.252 4.65e-08 ***
## poly(trip_seconds, 3)1 9.928e+01 8.345e+00 11.896 < 2e-16 ***
## poly(trip_seconds, 3)2 2.016e+01 6.442e+00 3.129 0.00271 **
## poly(trip_seconds, 3)3 -5.083e+00 2.674e+00 -1.901 0.06217 .
## trip_miles:trip_seconds -1.526e-03 2.923e-04 -5.222 2.34e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.353 on 60 degrees of freedom
## Multiple R-squared: 0.9619, Adjusted R-squared: 0.9581
## F-statistic: 252.5 on 6 and 60 DF, p-value: < 2.2e-16
# Standardized residuals of regout14 against its fitted values,
# with a horizontal reference line at zero
plot(x = regout14$fitted.values, y = rstandard(regout14), pch = 19)
abline(h = 0, lwd = 3, col = "red")
Discussions and Interpretations:
4.1 For the simple regression model (regout1), the Multiple and Adjusted R-squared values are 0.9296 and 0.9264 respectively, which suggests a good fit: roughly 93% of the variability of the dependent variable, fare, is accounted for by this model. 4.2 The impact of each independent variable can be interpreted from the beta-coefficients and p-values, as explained below. 4.2.1 Intercept: p = 0.0135 suggests that the beta-coefficient of the intercept is significant; in the hypothetical case where trip_seconds and trip_miles are both 0 and the payment type is Cash (the reference level), the predicted fare is 2.050875 units. 4.2.2 trip_miles: p < 2 x 10^(-16) suggests that the beta-coefficient of 1.374831 is significant, and indicates that for each extra mile the fare increases by 1.374831 units, holding the other predictors fixed. 4.2.3 trip_seconds: p = 3.38 x 10^(-12) suggests that the beta-coefficient is significant, and indicates that for each extra second spent in the trip an extra fare of 0.010218 units is charged, holding the other predictors fixed. 4.2.4 payment_typeCredit Card: p = 0.0274 < the 5% significance level suggests that the beta-coefficient is significant, and indicates that when a credit card is used for payment the predicted fare is 2.114804 units lower than for a cash payment, holding the other predictors fixed.
# LINE assumption checks for regout14
# Linearity: actual fare against fitted values, with the line y = x
plot(new.cleaned_trip.dataset$fare, regout14$fitted.values,
     main = "Actual vs Fitted Value Plot: Fare", pch = 19)
abline(a = 0, b = 1, lwd = 3, col = "red")
# Normality: normal Q-Q plot of the standardized residuals
qqnorm(rstandard(regout14), main = "Normality Plot of the Residuals")
qqline(rstandard(regout14))
# Equality of variance: standardized residuals against fitted values
plot(regout14$fitted.values, rstandard(regout14),
     main = "Fitted Values vs Residuals Plot: Fare", pch = 19)
abline(h = 0, lwd = 3, col = "red")
Discussions and Interpretations:
# Leverage Points Detection
lev <- hat(model.matrix(regout14))
plot(lev, pch = 19, main = "Leverage of Points Plot")
# Cut-off line at three times the mean leverage
abline(3 * mean(lev), 0, col = "red", lwd = 3)
# Print the rows whose leverage exceeds the cut-off
new.cleaned_trip.dataset[lev > (3 * mean(lev)), ]
# Removing leverage points
# Rows with trip_seconds == 2040 are dropped. A logical filter replaces
# -which(...): negative indexing with an empty match set would silently
# return zero rows instead of leaving the data untouched.
# NOTE(review): 2040 is hard-coded; presumably it is the row flagged by the
# listing above -- confirm before reusing this on a different sample.
reduced_trip.dataset <- new.cleaned_trip.dataset[
  !(new.cleaned_trip.dataset$trip_seconds %in% 2040),
]
attach(reduced_trip.dataset)
## The following objects are masked from cleansed_some.trip:
##
## extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
## trip_seconds, trip_total
## The following objects are masked from some.trips:
##
## extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
## trip_seconds, trip_total
# Final model: regout14 specification fitted to the reduced dataset
regout_final <- lm(
  formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds, 3) +
    trip_miles:trip_seconds,
  data = reduced_trip.dataset
)
summary(regout_final)
##
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds,
## 3) + trip_miles:trip_seconds, data = reduced_trip.dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2902 -0.9277 -0.0574 1.3578 7.6255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.899e+01 3.072e+00 9.434 2.17e-13 ***
## poly(trip_miles, 2)1 1.957e+02 3.032e+01 6.454 2.25e-08 ***
## poly(trip_miles, 2)2 3.668e+01 5.891e+00 6.226 5.43e-08 ***
## poly(trip_seconds, 3)1 1.568e+02 2.219e+01 7.068 2.07e-09 ***
## poly(trip_seconds, 3)2 5.258e+01 1.250e+01 4.208 8.91e-05 ***
## poly(trip_seconds, 3)3 -7.402e+00 2.551e+00 -2.902 0.0052 **
## trip_miles:trip_seconds -3.203e-03 6.286e-04 -5.095 3.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.213 on 59 degrees of freedom
## Multiple R-squared: 0.9645, Adjusted R-squared: 0.9609
## F-statistic: 266.9 on 6 and 59 DF, p-value: < 2.2e-16
Discussions and Interpretations:
# Preparing a new sample dataset: 100 rows drawn from trips with a new seed
set.seed(71088971)
sample.trips <- trips[sample(seq_len(nrow(trips)), size = 100, replace = FALSE), ]
attach(sample.trips)
## The following objects are masked from reduced_trip.dataset:
##
## extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
## trip_seconds, trip_total
## The following objects are masked from cleansed_some.trip:
##
## extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
## trip_seconds, trip_total
## The following objects are masked from some.trips:
##
## extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
## trip_seconds, trip_total
# Removing aberrant cases: keep trips with non-zero duration and distance
cleansed_sample.trip <- sample.trips[
  which(sample.trips$trip_seconds != 0 & sample.trips$trip_miles != 0),
]
attach(cleansed_sample.trip)
## The following objects are masked from sample.trips:
##
## extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
## trip_seconds, trip_total
## The following objects are masked from reduced_trip.dataset:
##
## extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
## trip_seconds, trip_total
## The following objects are masked from cleansed_some.trip:
##
## extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
## trip_seconds, trip_total
## The following objects are masked from some.trips:
##
## extras, fare, payment_type, taxi_id, tips, tolls, trip_miles,
## trip_seconds, trip_total
# Fit the final model specification to the new cleansed sample.
# NOTE(review): this refits the model and overwrites regout_final rather than
# scoring the new sample with the earlier fit (e.g. via predict()) -- confirm
# that a refit is what is intended here.
regout_final <- lm(
  formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds, 3) +
    trip_miles:trip_seconds,
  data = cleansed_sample.trip
)
summary(regout_final)
##
## Call:
## lm(formula = fare ~ poly(trip_miles, 2) + poly(trip_seconds,
## 3) + trip_miles:trip_seconds, data = cleansed_sample.trip)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2266 -1.4475 -0.3000 0.5687 13.3519
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.752e+01 7.096e-01 24.690 < 2e-16 ***
## poly(trip_miles, 2)1 1.052e+02 1.227e+01 8.576 2.77e-12 ***
## poly(trip_miles, 2)2 2.848e+01 3.903e+00 7.298 5.15e-10 ***
## poly(trip_seconds, 3)1 8.010e+01 4.554e+00 17.588 < 2e-16 ***
## poly(trip_seconds, 3)2 1.419e-01 3.595e+00 0.039 0.969
## poly(trip_seconds, 3)3 -4.490e+00 3.417e+00 -1.314 0.193
## trip_miles:trip_seconds -8.425e-04 1.386e-04 -6.080 7.06e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.692 on 65 degrees of freedom
## Multiple R-squared: 0.9524, Adjusted R-squared: 0.948
## F-statistic: 216.7 on 6 and 65 DF, p-value: < 2.2e-16
Discussions and Interpretations