# LESSON 3
# multiple linear regression recap
# y = ax1 + bx2 + cx3 + ... + intercept
x1 = c(1,2,3,4,5)
x2 = c(2,4,8,20,1)
x3 = c(10,20,30,40,50)
y = c(7,14,19,16,44) # think of an intuitive model before using ml: x3-(x1+x2)
df=data.frame(x1,x2,x3,y)
mlrModel = lm(y~.,df)
summary(mlrModel) # observe the predicted ml model compared to the intuitive model
## Warning in summary.lm(mlrModel): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = y ~ ., data = df)
##
## Residuals:
## 1 2 3 4 5
## -3.026e-15 6.071e-15 -3.255e-15 3.999e-16 -1.901e-16
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.542e-14 5.679e-15 4.476e+00 0.0465 *
## x1 9.000e+00 1.759e-15 5.118e+15 <2e-16 ***
## x2 -1.000e+00 3.590e-16 -2.786e+15 <2e-16 ***
## x3 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.329e-15 on 2 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.405e+31 on 2 and 2 DF, p-value: < 2.2e-16
predict(mlrModel,data.frame(x1=4,x2=20,x3=40))
## Warning in predict.lm(mlrModel, data.frame(x1 = 4, x2 = 20, x3 = 40)):
## prediction from a rank-deficient fit may be misleading
## 1
## 16
# MLR exercise
x1 = c(1,2,3,4,5)
x2 = c(2,4,8,20,1)
x3 = c(10,20,30,40,50)
x4 = c(200,400,800,2000,100)
y = c(187,374,759,1936,44) # think of an intuitive model before using ml: x4-(x1+x2+x3)
df=data.frame(x1,x2,x3,x4,y)
mlrModel = lm(y~.,df)
print(mlrModel) # observe the predicted ml model compared to intuitive model
##
## Call:
## lm(formula = y ~ ., data = df)
##
## Coefficients:
## (Intercept) x1 x2 x3 x4
## 0 -11 99 NA NA
predict(mlrModel,data.frame(x1=1,x2=2,x3=10,x4=200))
## Warning in predict.lm(mlrModel, data.frame(x1 = 1, x2 = 2, x3 = 10, x4 =
## 200)): prediction from a rank-deficient fit may be misleading
## 1
## 187
# MLR exercise on real world data to predict house prices
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
data(Sacramento)
df = Sacramento
str(df) # look at the variable types
## 'data.frame': 932 obs. of 9 variables:
## $ city : Factor w/ 37 levels "ANTELOPE","AUBURN",..: 34 34 34 34 34 34 34 34 29 31 ...
## $ zip : Factor w/ 68 levels "z95603","z95608",..: 64 52 44 44 53 65 66 49 24 25 ...
## $ beds : int 2 3 2 2 2 3 3 3 2 3 ...
## $ baths : num 1 1 1 1 1 1 2 1 2 2 ...
## $ sqft : int 836 1167 796 852 797 1122 1104 1177 941 1146 ...
## $ type : Factor w/ 3 levels "Condo","Multi_Family",..: 3 3 3 3 3 1 3 3 1 3 ...
## $ price : int 59222 68212 68880 69307 81900 89921 90895 91002 94905 98937 ...
## $ latitude : num 38.6 38.5 38.6 38.6 38.5 ...
## $ longitude: num -121 -121 -121 -121 -121 ...
head(df)
## city zip beds baths sqft type price latitude longitude
## 1 SACRAMENTO z95838 2 1 836 Residential 59222 38.63191 -121.4349
## 2 SACRAMENTO z95823 3 1 1167 Residential 68212 38.47890 -121.4310
## 3 SACRAMENTO z95815 2 1 796 Residential 68880 38.61830 -121.4438
## 4 SACRAMENTO z95815 2 1 852 Residential 69307 38.61684 -121.4391
## 5 SACRAMENTO z95824 2 1 797 Residential 81900 38.51947 -121.4358
## 6 SACRAMENTO z95841 3 1 1122 Condo 89921 38.66260 -121.3278
# convert non numeric types to numeric to use mlr
df$city=as.numeric(df$city)
df$type=as.numeric(df$type)
df$zip=as.numeric(substr(df$zip,2,length(df$zip)))
# split data between training and test
df$id=1:nrow(df)
n = nrow(df)
trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
train = df[trainIndex ,]
test = df[-trainIndex ,]
mlrModel = lm(price~city+zip+beds+baths+sqft+type+latitude+longitude,train)
summary(mlrModel) # observe R sq that explains % of variability/strength of the predicted model.
##
## Call:
## lm(formula = price ~ city + zip + beds + baths + sqft + type +
## latitude + longitude, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -240359 -47877 -12267 35515 541564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.833e+07 4.672e+06 3.923 9.58e-05 ***
## city -5.824e+01 2.999e+02 -0.194 0.846065
## zip -9.210e+01 4.570e+01 -2.015 0.044224 *
## beds -2.727e+04 5.101e+03 -5.346 1.20e-07 ***
## baths 1.419e+04 6.372e+03 2.228 0.026208 *
## sqft 1.388e+02 7.287e+00 19.052 < 2e-16 ***
## type 2.427e+04 6.560e+03 3.700 0.000232 ***
## latitude 6.759e+04 2.361e+04 2.863 0.004313 **
## longitude 9.980e+04 2.732e+04 3.654 0.000277 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78370 on 737 degrees of freedom
## Multiple R-squared: 0.6187, Adjusted R-squared: 0.6145
## F-statistic: 149.5 on 8 and 737 DF, p-value: < 2.2e-16
# The closer to 1, the better. If it is low, it implies some other variables are missing from the data that should have been.
# Feature engineering: by removing non important or one of the correlated features
cor(train) # see if two variables are correlated and remove the less important of the two
## city zip beds baths sqft
## city 1.000000000 0.568773456 -0.13223511 -0.224465403 -0.19588776
## zip 0.568773456 1.000000000 -0.08006091 -0.141525777 -0.15031979
## beds -0.132235115 -0.080060905 1.00000000 0.664006144 0.72842460
## baths -0.224465403 -0.141525777 0.66400614 1.000000000 0.75894338
## sqft -0.195887756 -0.150319795 0.72842460 0.758943379 1.00000000
## type 0.008384098 0.009062949 0.31504323 0.126202622 0.20243840
## price -0.224750709 -0.246277587 0.45967858 0.580553108 0.74867942
## latitude -0.022319940 -0.167796754 -0.05120473 0.044494588 0.06062313
## longitude -0.366731378 -0.537258849 0.07765542 0.178551048 0.23930152
## id -0.012472802 -0.047594510 0.01255574 -0.003454384 0.03664664
## type price latitude longitude id
## city 0.008384098 -0.22475071 -0.02231994 -0.366731378 -0.012472802
## zip 0.009062949 -0.24627759 -0.16779675 -0.537258849 -0.047594510
## beds 0.315043230 0.45967858 -0.05120473 0.077655416 0.012555744
## baths 0.126202622 0.58055311 0.04449459 0.178551048 -0.003454384
## sqft 0.202438404 0.74867942 0.06062313 0.239301521 0.036646636
## type 1.000000000 0.18775868 -0.08990775 0.001127325 0.094280504
## price 0.187758680 1.00000000 0.17690604 0.355663715 0.079518030
## latitude -0.089907753 0.17690604 1.00000000 0.389586783 0.087499908
## longitude 0.001127325 0.35566372 0.38958678 1.000000000 0.029411611
## id 0.094280504 0.07951803 0.08749991 0.029411611 1.000000000
# or observe visually that variables are not correlated to each other.
psych::pairs.panels(train,
method = "pearson", # correlation method
hist.col = "#00AFBB",
density = TRUE, # show density plots
ellipses = TRUE # show correlation ellipses
) # or visually

summary(mlrModel) # look at model summary for * and p-values to find the most important features.
##
## Call:
## lm(formula = price ~ city + zip + beds + baths + sqft + type +
## latitude + longitude, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -240359 -47877 -12267 35515 541564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.833e+07 4.672e+06 3.923 9.58e-05 ***
## city -5.824e+01 2.999e+02 -0.194 0.846065
## zip -9.210e+01 4.570e+01 -2.015 0.044224 *
## beds -2.727e+04 5.101e+03 -5.346 1.20e-07 ***
## baths 1.419e+04 6.372e+03 2.228 0.026208 *
## sqft 1.388e+02 7.287e+00 19.052 < 2e-16 ***
## type 2.427e+04 6.560e+03 3.700 0.000232 ***
## latitude 6.759e+04 2.361e+04 2.863 0.004313 **
## longitude 9.980e+04 2.732e+04 3.654 0.000277 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78370 on 737 degrees of freedom
## Multiple R-squared: 0.6187, Adjusted R-squared: 0.6145
## F-statistic: 149.5 on 8 and 737 DF, p-value: < 2.2e-16
(v=varImp(mlrModel)) # analyze variable importance values
## Overall
## city 0.1942101
## zip 2.0154134
## beds 5.3458470
## baths 2.2276069
## sqft 19.0522170
## type 3.6998605
## latitude 2.8631652
## longitude 3.6535089
# or observe visually
v$var = rownames(v)
library(ggvis)
##
## Attaching package: 'ggvis'
## The following object is masked from 'package:ggplot2':
##
## resolution
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
v %>% ggvis(~var, ~Overall) %>% layer_bars()
# now retrain after feature engineering done on fewer variables
mlrModel = lm(price~beds+sqft+type+longitude,train)
summary(mlrModel) # compare R sq which should be similar as before, means we still have a strong model with reduced features
##
## Call:
## lm(formula = price ~ beds + sqft + type + longitude, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -256548 -50294 -11726 35921 579778
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.929e+07 2.662e+06 7.247 1.07e-12 ***
## beds -2.465e+04 4.946e+03 -4.984 7.77e-07 ***
## sqft 1.479e+02 6.293e+00 23.505 < 2e-16 ***
## type 2.078e+04 6.562e+03 3.167 0.0016 **
## longitude 1.588e+05 2.193e+04 7.241 1.12e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 79170 on 741 degrees of freedom
## Multiple R-squared: 0.6088, Adjusted R-squared: 0.6067
## F-statistic: 288.3 on 4 and 741 DF, p-value: < 2.2e-16
# now analyze the predictions on the original training data set to see how accurate the model is
predictHousePrice = round(predict(mlrModel, type = "response")) # predict house prices on each row of the training data
df_predicted = cbind(train,predictHousePrice) # add this as a column to the training data and put in a new df
df_predicted$pricediff= round ((df_predicted$price - df_predicted$predictHousePrice)/df_predicted$price) # add a column for the difference between predicted and actual price and view the differences
# plot the actual and predicted values
df_predicted_plot=df_predicted %>% select(id, price, predictHousePrice)
library(googleVis)
## Creating a generic function for 'toJSON' from package 'jsonlite' in package 'googleVis'
##
## Welcome to googleVis version 0.6.4
##
## Please read Google's Terms of Use
## before you start using the package:
## https://developers.google.com/terms/
##
## Note, the plot method of googleVis will by default use
## the standard browser to display its output.
##
## See the googleVis package vignettes for more details,
## or visit https://github.com/mages/googleVis.
##
## To suppress this message use:
## suppressPackageStartupMessages(library(googleVis))
plot(gvisBarChart(df_predicted_plot[1:100,], options=list(height=6000))) # actual first 100 row values
## starting httpd help server ...
## done
ggplot(df_predicted, aes(x=pricediff)) + geom_histogram(bins=4) # histogram distribution on the difference

# Homework assignment for next two weeks - we will set aside 1 hour at the end of the class to discuss 1:1.
# Final project proposal using R Markdown or R presentation framework.
# Provide the following inputs in your proposal:
# What problem you're trying to solve?
# What you're trying to predict?
# Why is this relevant or useful?
# What ground truth data do you have?
# What are the variables and what is the label?
# How is the data quality?
# Do you need to prepare the data?
# How do you plan to conduct the exploratory analysis?
# What algorithm(s) do you plan to use and why?
# Train and test the model
# Provide performance metrics
# What could you have done differently or better?
# Conclusion
# Note: If you can't come up with a problem, then use this data to predict house price in King county: https://www.kaggle.com/harlfoxem/housesalesprediction