10/16/2018

Fares and Distance

##   fare_amount         CrowD        
##  Min.   :  0.00   Min.   : 0.0000  
##  1st Qu.:  6.00   1st Qu.: 0.5346  
##  Median :  8.50   Median : 0.9583  
##  Mean   : 11.34   Mean   : 1.6743  
##  3rd Qu.: 12.50   3rd Qu.: 1.7634  
##  Max.   :205.33   Max.   :28.8069  
##                   NA's   :1126

Fares and Distance Filtered

summary(sample_taxi_train %>% 
    filter(fare_amount >= 2.50, CrowD > 0.25) %>% 
      select(fare_amount, CrowD))
##   fare_amount         CrowD        
##  Min.   :  2.50   Min.   : 0.2501  
##  1st Qu.:  6.50   1st Qu.: 0.6237  
##  Median :  8.90   Median : 1.0456  
##  Mean   : 11.68   Mean   : 1.8016  
##  3rd Qu.: 13.00   3rd Qu.: 1.8644  
##  Max.   :205.33   Max.   :28.8069

Fares and Distance Plotted

z-score function

zcore <- function(x){
  (x - mean(x, na.rm=TRUE)) / sd(x, na.rm=TRUE)
}

Fares and Distances Scaled

sample_taxi_train <- sample_taxi_train %>% 
  mutate(fare.z = zcore(fare_amount), CrowD.z = zcore(CrowD)) 
summary(sample_taxi_train %>% 
          select(fare_amount, CrowD, fare.z, CrowD.z))
##   fare_amount         CrowD             fare.z           CrowD.z        
##  Min.   :  2.50   Min.   : 0.2501   Min.   :-0.9722   Min.   :-0.65172  
##  1st Qu.:  6.50   1st Qu.: 0.6237   1st Qu.:-0.5485   1st Qu.:-0.49481  
##  Median :  8.90   Median : 1.0456   Median :-0.2944   Median :-0.31757  
##  Mean   : 11.68   Mean   : 1.8016   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 13.00   3rd Qu.: 1.8644   3rd Qu.: 0.1398   3rd Qu.: 0.02638  
##  Max.   :205.33   Max.   :28.8069   Max.   :20.5082   Max.   :11.34387

Fare and Distance z-scores Plotted

Fare and Distance Correlations

sample_taxi_train %>% 
  summarize(r = cor(fare_amount, CrowD), r.z= cor(fare.z, CrowD.z))
##           r       r.z
## 1 0.8581072 0.8581072

Fares and Distance z-scores Plotted v.2

sample_taxi_train %>% filter(fare_amount >= 2.50, CrowD > 0.25) %>% 
  ggplot(aes(CrowD.z, fare.z)) + geom_point(size=0.1) +
  geom_abline(slope=0.8581072, intercept=0, color="red")

A Little Algebra

\(y = m \cdot x + b\)

For the best fit line: \(y_z = r \cdot x_z\)

\(\frac{y - \mu_y}{\sigma_y} = r \cdot \frac{x - \mu_x}{\sigma_x}\)

\(y - \mu_y = r \cdot (x - \mu_x)\cdot\frac{\sigma_y}{\sigma_x}\)

\(y = r \cdot \frac{\sigma_y}{\sigma_x} x + (\mu_y- \mu_x \cdot r \cdot \frac{\sigma_y}{\sigma_x})\)

\(m = r \cdot \frac{\sigma_y}{\sigma_x}\)

\(b = \mu_y- \mu_x\cdot\frac{\sigma_y}{\sigma_x}\)

New Functions

best_fit_slope <- function(x,y){
  cor(x,y)*(sd(y)/sd(x))
  }

best_fit_intercept <- function(x,y){
  mean(y) - mean(x)*cor(x,y)*(sd(y)/sd(x))
}

Predicting Fares from Distance

sample_taxi_train %>% 
  summarize(m = best_fit_slope(CrowD, fare_amount), 
            b = best_fit_intercept(CrowD, fare_amount))
##          m        b
## 1 3.403645 5.547617

Predicting Fares from Distance (Built in)

lm(fare_amount ~ CrowD, data=sample_taxi_train)
## 
## Call:
## lm(formula = fare_amount ~ CrowD, data = sample_taxi_train)
## 
## Coefficients:
## (Intercept)        CrowD  
##       5.548        3.404

Predictions

m <- lm(fare_amount ~ CrowD, data=sample_taxi_train)
predict(m, test)