## fare_amount CrowD ## Min. : 0.00 Min. : 0.0000 ## 1st Qu.: 6.00 1st Qu.: 0.5346 ## Median : 8.50 Median : 0.9583 ## Mean : 11.34 Mean : 1.6743 ## 3rd Qu.: 12.50 3rd Qu.: 1.7634 ## Max. :205.33 Max. :28.8069 ## NA's :1126
10/16/2018
## fare_amount CrowD ## Min. : 0.00 Min. : 0.0000 ## 1st Qu.: 6.00 1st Qu.: 0.5346 ## Median : 8.50 Median : 0.9583 ## Mean : 11.34 Mean : 1.6743 ## 3rd Qu.: 12.50 3rd Qu.: 1.7634 ## Max. :205.33 Max. :28.8069 ## NA's :1126
summary(sample_taxi_train %>% filter(fare_amount >= 2.50, CrowD > 0.25) %>% select(fare_amount, CrowD))
## fare_amount CrowD ## Min. : 2.50 Min. : 0.2501 ## 1st Qu.: 6.50 1st Qu.: 0.6237 ## Median : 8.90 Median : 1.0456 ## Mean : 11.68 Mean : 1.8016 ## 3rd Qu.: 13.00 3rd Qu.: 1.8644 ## Max. :205.33 Max. :28.8069
zcore <- function(x){ (x - mean(x, na.rm=TRUE)) / sd(x, na.rm=TRUE) }
sample_taxi_train <- sample_taxi_train %>% mutate(fare.z = zcore(fare_amount), CrowD.z = zcore(CrowD)) summary(sample_taxi_train %>% select(fare_amount, CrowD, fare.z, CrowD.z))
## fare_amount CrowD fare.z CrowD.z ## Min. : 2.50 Min. : 0.2501 Min. :-0.9722 Min. :-0.65172 ## 1st Qu.: 6.50 1st Qu.: 0.6237 1st Qu.:-0.5485 1st Qu.:-0.49481 ## Median : 8.90 Median : 1.0456 Median :-0.2944 Median :-0.31757 ## Mean : 11.68 Mean : 1.8016 Mean : 0.0000 Mean : 0.00000 ## 3rd Qu.: 13.00 3rd Qu.: 1.8644 3rd Qu.: 0.1398 3rd Qu.: 0.02638 ## Max. :205.33 Max. :28.8069 Max. :20.5082 Max. :11.34387
sample_taxi_train %>% summarize(r = cor(fare_amount, CrowD), r.z= cor(fare.z, CrowD.z))
## r r.z ## 1 0.8581072 0.8581072
sample_taxi_train %>% filter(fare_amount >= 2.50, CrowD > 0.25) %>% ggplot(aes(CrowD.z, fare.z)) + geom_point(size=0.1) + geom_abline(slope=0.8581072, intercept=0, color="red")
\(y = m \cdot x + b\)
For the best fit line: \(y_z = r \cdot x_z\)
\(\frac{y - \mu_y}{\sigma_y} = r \cdot \frac{x - \mu_x}{\sigma_x}\)
\(y - \mu_y = r \cdot (x - \mu_x)\cdot\frac{\sigma_y}{\sigma_x}\)
\(y = r \cdot \frac{\sigma_y}{\sigma_x} x + (\mu_y- \mu_x \cdot r \cdot \frac{\sigma_y}{\sigma_x})\)
\(m = r \cdot \frac{\sigma_y}{\sigma_x}\)
\(b = \mu_y- \mu_x\cdot\frac{\sigma_y}{\sigma_x}\)
best_fit_slope <- function(x,y){ cor(x,y)*(sd(y)/sd(x)) } best_fit_intercept <- function(x,y){ mean(y) - mean(x)*cor(x,y)*(sd(y)/sd(x)) }
sample_taxi_train %>% summarize(m = best_fit_slope(CrowD, fare_amount), b = best_fit_intercept(CrowD, fare_amount))
## m b ## 1 3.403645 5.547617
lm(fare_amount ~ CrowD, data=sample_taxi_train)
## ## Call: ## lm(formula = fare_amount ~ CrowD, data = sample_taxi_train) ## ## Coefficients: ## (Intercept) CrowD ## 5.548 3.404
m <- lm(fare_amount ~ CrowD, data=sample_taxi_train) predict(m, test)