# LESSON 3
# multiple linear regression recap
# y = ax1 + bx2 + cx3 + ... + intercept
x1 = c(1,2,3,4,5)
x2 = c(2,4,8,20,1)
x3 = c(10,20,30,40,50)
y = c(7,14,19,16,44) # think of an intuitive model before using ml: x3-(x1+x2)
df=data.frame(x1,x2,x3,y)
mlrModel = lm(y~.,df)
summary(mlrModel) # observe the predicted ml model compared to the intuitive model
## Warning in summary.lm(mlrModel): essentially perfect fit: summary may be
## unreliable
## 
## Call:
## lm(formula = y ~ ., data = df)
## 
## Residuals:
##          1          2          3          4          5 
## -3.026e-15  6.071e-15 -3.255e-15  3.999e-16 -1.901e-16 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)  2.542e-14  5.679e-15  4.476e+00   0.0465 *  
## x1           9.000e+00  1.759e-15  5.118e+15   <2e-16 ***
## x2          -1.000e+00  3.590e-16 -2.786e+15   <2e-16 ***
## x3                  NA         NA         NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.329e-15 on 2 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.405e+31 on 2 and 2 DF,  p-value: < 2.2e-16
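# why x3 is NA: x3 is exactly 10*x1, so the design matrix is rank-deficient and
# lm() drops x3, folding its effect into x1: x3 - (x1 + x2) = 10*x1 - x1 - x2,
# i.e. the 9*x1 - 1*x2 model reported above. A quick check:
all(df$x3 == 10*df$x1) # TRUE: perfect collinearity
with(df, 9*x1 - 1*x2)  # reproduces y exactly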
predict(mlrModel,data.frame(x1=4,x2=20,x3=40))
## Warning in predict.lm(mlrModel, data.frame(x1 = 4, x2 = 20, x3 = 40)):
## prediction from a rank-deficient fit may be misleading
##  1 
## 16
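# the rank-deficiency warning is worth taking seriously: because x3 was dropped,
# predict() ignores it entirely, so any x3 value yields the same prediction here
predict(mlrModel, data.frame(x1=4, x2=20, x3=0)) # still 16; x3 has no effect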
# MLR exercise
x1 = c(1,2,3,4,5)
x2 = c(2,4,8,20,1)
x3 = c(10,20,30,40,50)
x4 = c(200,400,800,2000,100)
y = c(187,374,759,1936,44) # think of an intuitive model before using ml: x4-(x1+x2+x3)
df=data.frame(x1,x2,x3,x4,y)
mlrModel = lm(y~.,df)
print(mlrModel) # observe the predicted ml model compared to the intuitive model
## 
## Call:
## lm(formula = y ~ ., data = df)
## 
## Coefficients:
## (Intercept)           x1           x2           x3           x4  
##           0          -11           99           NA           NA
predict(mlrModel,data.frame(x1=1,x2=2,x3=10,x4=200))
## Warning in predict.lm(mlrModel, data.frame(x1 = 1, x2 = 2, x3 = 10, x4 =
## 200)): prediction from a rank-deficient fit may be misleading
##   1 
## 187
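# same story: x3 = 10*x1 and x4 = 100*x2, so lm() drops both. Substituting them
# out of the intuitive model x4 - (x1+x2+x3) gives 100*x2 - x1 - x2 - 10*x1 =
# -11*x1 + 99*x2, which matches the coefficients printed above:
with(df, -11*x1 + 99*x2) # reproduces y exactly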
# MLR exercise on real-world data to predict house prices
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
data(Sacramento)
df = Sacramento
str(df) # look at the variable types 
## 'data.frame':    932 obs. of  9 variables:
##  $ city     : Factor w/ 37 levels "ANTELOPE","AUBURN",..: 34 34 34 34 34 34 34 34 29 31 ...
##  $ zip      : Factor w/ 68 levels "z95603","z95608",..: 64 52 44 44 53 65 66 49 24 25 ...
##  $ beds     : int  2 3 2 2 2 3 3 3 2 3 ...
##  $ baths    : num  1 1 1 1 1 1 2 1 2 2 ...
##  $ sqft     : int  836 1167 796 852 797 1122 1104 1177 941 1146 ...
##  $ type     : Factor w/ 3 levels "Condo","Multi_Family",..: 3 3 3 3 3 1 3 3 1 3 ...
##  $ price    : int  59222 68212 68880 69307 81900 89921 90895 91002 94905 98937 ...
##  $ latitude : num  38.6 38.5 38.6 38.6 38.5 ...
##  $ longitude: num  -121 -121 -121 -121 -121 ...
head(df)
##         city    zip beds baths sqft        type price latitude longitude
## 1 SACRAMENTO z95838    2     1  836 Residential 59222 38.63191 -121.4349
## 2 SACRAMENTO z95823    3     1 1167 Residential 68212 38.47890 -121.4310
## 3 SACRAMENTO z95815    2     1  796 Residential 68880 38.61830 -121.4438
## 4 SACRAMENTO z95815    2     1  852 Residential 69307 38.61684 -121.4391
## 5 SACRAMENTO z95824    2     1  797 Residential 81900 38.51947 -121.4358
## 6 SACRAMENTO z95841    3     1 1122       Condo 89921 38.66260 -121.3278
# convert non-numeric types to numeric to use MLR
df$city = as.numeric(df$city)
df$type = as.numeric(df$type)
# note: the stop position of substr() must be the string width, not length(df$zip)
# (the vector length); substring() simply keeps everything after the leading "z"
df$zip = as.numeric(substring(as.character(df$zip), 2))
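# caveat: coding factors as their integer level codes imposes an arbitrary ordering
# on city/zip/type. An alternative sketch (not used below) is to keep them as factors
# and let lm()/model.matrix() expand them into 0/1 dummy variables:
head(model.matrix(~type, Sacramento)) # one column per non-baseline level of type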

# split the data into training and test sets
df$id = 1:nrow(df) # row id, used later when plotting predictions
n = nrow(df)
trainIndex = sample(1:n, size = round(0.8*n), replace = FALSE)
train = df[trainIndex, ]
test = df[-trainIndex, ]
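# for a reproducible split, call set.seed() before sample(). caret's
# createDataPartition() is an alternative that stratifies the split on the outcome:
# trainIndex = createDataPartition(df$price, p = 0.8, list = FALSE)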

mlrModel = lm(price~city+zip+beds+baths+sqft+type+latitude+longitude,train)
summary(mlrModel) # observe R-squared, the fraction of price variability explained by the model (a measure of its strength)
## 
## Call:
## lm(formula = price ~ city + zip + beds + baths + sqft + type + 
##     latitude + longitude, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -240359  -47877  -12267   35515  541564 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.833e+07  4.672e+06   3.923 9.58e-05 ***
## city        -5.824e+01  2.999e+02  -0.194 0.846065    
## zip         -9.210e+01  4.570e+01  -2.015 0.044224 *  
## beds        -2.727e+04  5.101e+03  -5.346 1.20e-07 ***
## baths        1.419e+04  6.372e+03   2.228 0.026208 *  
## sqft         1.388e+02  7.287e+00  19.052  < 2e-16 ***
## type         2.427e+04  6.560e+03   3.700 0.000232 ***
## latitude     6.759e+04  2.361e+04   2.863 0.004313 ** 
## longitude    9.980e+04  2.732e+04   3.654 0.000277 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78370 on 737 degrees of freedom
## Multiple R-squared:  0.6187, Adjusted R-squared:  0.6145 
## F-statistic: 149.5 on 8 and 737 DF,  p-value: < 2.2e-16
# The closer R-squared is to 1, the better. A low value suggests that important predictor variables are missing from the data.
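# R-squared can be recomputed from its definition, 1 - SS_res/SS_tot:
1 - sum(residuals(mlrModel)^2) / sum((train$price - mean(train$price))^2)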


# Feature selection: remove unimportant features, or one of each pair of strongly correlated features
cor(train) # check which variables are correlated and remove the less important of each correlated pair
##                   city          zip        beds        baths        sqft
## city       1.000000000  0.568773456 -0.13223511 -0.224465403 -0.19588776
## zip        0.568773456  1.000000000 -0.08006091 -0.141525777 -0.15031979
## beds      -0.132235115 -0.080060905  1.00000000  0.664006144  0.72842460
## baths     -0.224465403 -0.141525777  0.66400614  1.000000000  0.75894338
## sqft      -0.195887756 -0.150319795  0.72842460  0.758943379  1.00000000
## type       0.008384098  0.009062949  0.31504323  0.126202622  0.20243840
## price     -0.224750709 -0.246277587  0.45967858  0.580553108  0.74867942
## latitude  -0.022319940 -0.167796754 -0.05120473  0.044494588  0.06062313
## longitude -0.366731378 -0.537258849  0.07765542  0.178551048  0.23930152
## id        -0.012472802 -0.047594510  0.01255574 -0.003454384  0.03664664
##                   type       price    latitude    longitude           id
## city       0.008384098 -0.22475071 -0.02231994 -0.366731378 -0.012472802
## zip        0.009062949 -0.24627759 -0.16779675 -0.537258849 -0.047594510
## beds       0.315043230  0.45967858 -0.05120473  0.077655416  0.012555744
## baths      0.126202622  0.58055311  0.04449459  0.178551048 -0.003454384
## sqft       0.202438404  0.74867942  0.06062313  0.239301521  0.036646636
## type       1.000000000  0.18775868 -0.08990775  0.001127325  0.094280504
## price      0.187758680  1.00000000  0.17690604  0.355663715  0.079518030
## latitude  -0.089907753  0.17690604  1.00000000  0.389586783  0.087499908
## longitude  0.001127325  0.35566372  0.38958678  1.000000000  0.029411611
## id         0.094280504  0.07951803  0.08749991  0.029411611  1.000000000
# or inspect visually whether variables are correlated with each other:
psych::pairs.panels(train, 
                    method = "pearson", # correlation method
                    hist.col = "#00AFBB",
                    density = TRUE,  # show density plots
                    ellipses = TRUE # show correlation ellipses
)
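# caret can also flag highly correlated columns automatically; a sketch with
# findCorrelation() (in practice, exclude the outcome price and the id column first):
findCorrelation(cor(train), cutoff = 0.75, names = TRUE)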

summary(mlrModel) # look at the significance stars and p-values in the model summary to find the most important features.
## 
## Call:
## lm(formula = price ~ city + zip + beds + baths + sqft + type + 
##     latitude + longitude, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -240359  -47877  -12267   35515  541564 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.833e+07  4.672e+06   3.923 9.58e-05 ***
## city        -5.824e+01  2.999e+02  -0.194 0.846065    
## zip         -9.210e+01  4.570e+01  -2.015 0.044224 *  
## beds        -2.727e+04  5.101e+03  -5.346 1.20e-07 ***
## baths        1.419e+04  6.372e+03   2.228 0.026208 *  
## sqft         1.388e+02  7.287e+00  19.052  < 2e-16 ***
## type         2.427e+04  6.560e+03   3.700 0.000232 ***
## latitude     6.759e+04  2.361e+04   2.863 0.004313 ** 
## longitude    9.980e+04  2.732e+04   3.654 0.000277 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78370 on 737 degrees of freedom
## Multiple R-squared:  0.6187, Adjusted R-squared:  0.6145 
## F-statistic: 149.5 on 8 and 737 DF,  p-value: < 2.2e-16
(v=varImp(mlrModel)) # analyze variable importance; for lm, caret's varImp is the absolute t-statistic of each coefficient (compare with the summary above)
##              Overall
## city       0.1942101
## zip        2.0154134
## beds       5.3458470
## baths      2.2276069
## sqft      19.0522170
## type       3.6998605
## latitude   2.8631652
## longitude  3.6535089
# or observe visually
v$var = rownames(v)
library(ggvis)
## 
## Attaching package: 'ggvis'
## The following object is masked from 'package:ggplot2':
## 
##     resolution
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
v %>% ggvis(~var, ~Overall) %>% layer_bars()
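# ggvis is no longer actively developed; the same bar chart can be drawn with
# ggplot2 (already loaded by caret), for example:
ggplot(v, aes(x = reorder(var, Overall), y = Overall)) + geom_col() + coord_flip()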
# now retrain on fewer variables, after the feature selection above
mlrModel = lm(price~beds+sqft+type+longitude,train) 
summary(mlrModel) # compare R-squared: it should be similar to before, meaning the model remains strong with fewer features
## 
## Call:
## lm(formula = price ~ beds + sqft + type + longitude, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -256548  -50294  -11726   35921  579778 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.929e+07  2.662e+06   7.247 1.07e-12 ***
## beds        -2.465e+04  4.946e+03  -4.984 7.77e-07 ***
## sqft         1.479e+02  6.293e+00  23.505  < 2e-16 ***
## type         2.078e+04  6.562e+03   3.167   0.0016 ** 
## longitude    1.588e+05  2.193e+04   7.241 1.12e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79170 on 741 degrees of freedom
## Multiple R-squared:  0.6088, Adjusted R-squared:  0.6067 
## F-statistic: 288.3 on 4 and 741 DF,  p-value: < 2.2e-16
# now analyze the predictions on the original training data set to see how accurate the model is
predictHousePrice = round(predict(mlrModel, type = "response")) # predict house prices for each row of the training data
df_predicted = cbind(train, predictHousePrice) # add the predictions as a column to a new data frame
# add a column with the relative difference between actual and predicted price
# (round to 2 decimals; rounding to whole numbers would collapse almost every value to 0 or +/-1)
df_predicted$pricediff = round((df_predicted$price - df_predicted$predictHousePrice)/df_predicted$price, 2)
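
# the test set split off earlier has not been used yet; a minimal sketch of
# out-of-sample evaluation with caret's postResample(), which reports RMSE,
# R-squared, and MAE on the held-out rows:
testPred = predict(mlrModel, newdata = test)
postResample(pred = testPred, obs = test$price)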

# plot the actual and predicted values 
df_predicted_plot=df_predicted %>% select(id, price, predictHousePrice) 
library(googleVis)
## Creating a generic function for 'toJSON' from package 'jsonlite' in package 'googleVis'
## 
## Welcome to googleVis version 0.6.4
## 
## Please read Google's Terms of Use
## before you start using the package:
## https://developers.google.com/terms/
## 
## Note, the plot method of googleVis will by default use
## the standard browser to display its output.
## 
## See the googleVis package vignettes for more details,
## or visit https://github.com/mages/googleVis.
## 
## To suppress this message use:
## suppressPackageStartupMessages(library(googleVis))
plot(gvisBarChart(df_predicted_plot[1:100,], options=list(height=6000))) # actual vs. predicted prices for the first 100 rows
## starting httpd help server ...
##  done
ggplot(df_predicted, aes(x=pricediff)) + geom_histogram(bins=4) # histogram of the relative price differences

# Homework assignment for next two weeks - we will set aside 1 hour at the end of the class to discuss 1:1.
# Final project proposal using R Markdown or R presentation framework.
# Provide the following inputs in your proposal:
# What problem are you trying to solve?
# What are you trying to predict?
# Why is this relevant or useful?
# What ground truth data do you have?
# What are the variables and what is the label?
# How is the data quality?
# Do you need to prepare the data?
# How do you plan to conduct the exploratory analysis?
# What algorithm(s) do you plan to use and why?
# Train and test the model
# Provide performance metrics
# What could you have done differently or better?
# Conclusion
# Note: If you can't come up with a problem, then use this dataset to predict house prices in King County: https://www.kaggle.com/harlfoxem/housesalesprediction