# LESSON 4
# Recap: multiple linear regression (MLR) exercise on real-world data to predict house prices
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
data(Sacramento)
df = Sacramento
str(df) # look at the variable types
## 'data.frame': 932 obs. of 9 variables:
## $ city : Factor w/ 37 levels "ANTELOPE","AUBURN",..: 34 34 34 34 34 34 34 34 29 31 ...
## $ zip : Factor w/ 68 levels "z95603","z95608",..: 64 52 44 44 53 65 66 49 24 25 ...
## $ beds : int 2 3 2 2 2 3 3 3 2 3 ...
## $ baths : num 1 1 1 1 1 1 2 1 2 2 ...
## $ sqft : int 836 1167 796 852 797 1122 1104 1177 941 1146 ...
## $ type : Factor w/ 3 levels "Condo","Multi_Family",..: 3 3 3 3 3 1 3 3 1 3 ...
## $ price : int 59222 68212 68880 69307 81900 89921 90895 91002 94905 98937 ...
## $ latitude : num 38.6 38.5 38.6 38.6 38.5 ...
## $ longitude: num -121 -121 -121 -121 -121 ...
head(df)
## city zip beds baths sqft type price latitude longitude
## 1 SACRAMENTO z95838 2 1 836 Residential 59222 38.63191 -121.4349
## 2 SACRAMENTO z95823 3 1 1167 Residential 68212 38.47890 -121.4310
## 3 SACRAMENTO z95815 2 1 796 Residential 68880 38.61830 -121.4438
## 4 SACRAMENTO z95815 2 1 852 Residential 69307 38.61684 -121.4391
## 5 SACRAMENTO z95824 2 1 797 Residential 81900 38.51947 -121.4358
## 6 SACRAMENTO z95841 3 1 1122 Condo 89921 38.66260 -121.3278
# convert non-numeric variables to numeric so they can be used in MLR
df$city = as.numeric(df$city)
df$type = as.numeric(df$type)
df$zip = as.numeric(substring(df$zip, 2)) # strip the leading "z"; substring() with no end argument runs to the end of the string (length() would give the vector length, not the string length)
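# Side note: coding factors as raw integers imposes an artificial ordering on cities
# and property types. A common alternative is one-hot (dummy) encoding; a minimal
# sketch using caret's dummyVars() (the dv/dummies names are our own):
dv = dummyVars(~ city + type, data = Sacramento)
dummies = predict(dv, newdata = Sacramento) # one 0/1 column per factor level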
# split the data into training and test sets
df$id = 1:nrow(df) # row id, used later when plotting predictions
n = nrow(df)
trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
train = df[trainIndex, ]
test = df[-trainIndex, ]
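# Note: sample() draws a different split on every run. For a reproducible (and
# optionally stratified) split, a sketch using base set.seed() and caret's
# createDataPartition() (the seed value 123 is arbitrary; not run here):
# set.seed(123)
# trainIndex = createDataPartition(df$price, p = 0.8, list = FALSE)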
# now retrain on a reduced set of features chosen during feature engineering
mlrModel = lm(price~beds+sqft+type+longitude,train)
summary(mlrModel) # R-squared should be close to the full model's, meaning the model stays strong with fewer features
##
## Call:
## lm(formula = price ~ beds + sqft + type + longitude, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -260325 -48471 -8877 36302 392465
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.061e+07 2.472e+06 8.338 3.67e-16 ***
## beds -2.546e+04 4.564e+03 -5.577 3.43e-08 ***
## sqft 1.476e+02 5.539e+00 26.653 < 2e-16 ***
## type 1.736e+04 6.139e+03 2.828 0.00482 **
## longitude 1.696e+05 2.036e+04 8.329 3.94e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74890 on 741 degrees of freedom
## Multiple R-squared: 0.6622, Adjusted R-squared: 0.6604
## F-statistic: 363.2 on 4 and 741 DF, p-value: < 2.2e-16
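# 95% confidence intervals for the coefficients (base R's confint(); output not shown):
confint(mlrModel)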
# predict on training data
predictHousePrice = round(predict(mlrModel, newdata = train, type = "response")) # predict house prices on each row of the training data
df_predicted_train = cbind(train,predictHousePrice) # add this as a column to the training data and put in a new df
df_predicted_train$pricediff =
  round((df_predicted_train$price - df_predicted_train$predictHousePrice)/
        df_predicted_train$price) # add a column for the relative difference between actual and predicted price, rounded to the nearest whole number
# plot the actual and predicted values
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df_predicted_plot=df_predicted_train %>% select(id, price, predictHousePrice)
library(googleVis)
## Creating a generic function for 'toJSON' from package 'jsonlite' in package 'googleVis'
##
## Welcome to googleVis version 0.6.4
##
## Please read Google's Terms of Use
## before you start using the package:
## https://developers.google.com/terms/
##
## Note, the plot method of googleVis will by default use
## the standard browser to display its output.
##
## See the googleVis package vignettes for more details,
## or visit https://github.com/mages/googleVis.
##
## To suppress this message use:
## suppressPackageStartupMessages(library(googleVis))
plot(gvisBarChart(df_predicted_plot[1:100,], options=list(height=6000))) # actual vs. predicted prices for the first 100 rows
## starting httpd help server ...
## done
ggplot(df_predicted_train, aes(x=pricediff)) + geom_histogram(bins=4) # histogram of the price differences
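# Because pricediff is rounded to whole numbers, most values collapse to 0 and the
# histogram above is very coarse. A sketch of a finer-grained view (the
# pricediff_pct column is our own addition):
df_predicted_train$pricediff_pct = (df_predicted_train$price -
  df_predicted_train$predictHousePrice) / df_predicted_train$price
ggplot(df_predicted_train, aes(x=pricediff_pct)) + geom_histogram(bins=30) # unrounded relative error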

# predict on test data
predictHousePrice = round(predict(mlrModel, newdata = test, type = "response")) # predict house prices on each row of the test data
df_predicted_test = cbind(test,predictHousePrice) # add this as a column to the test data and put it in a new df
df_predicted_test$pricediff = round((df_predicted_test$price - df_predicted_test$predictHousePrice)/df_predicted_test$price) # add a column for the relative difference between actual and predicted price
# plot the actual and predicted values
library(dplyr)
df_predicted_plot=df_predicted_test %>% select(id, price, predictHousePrice)
library(googleVis)
plot(gvisBarChart(df_predicted_plot[1:100,], options=list(height=6000))) # actual vs. predicted prices for the first 100 rows
ggplot(df_predicted_test, aes(x=pricediff)) + geom_histogram(bins=4) # histogram of the price differences
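# A single-number summary of test-set fit, using caret's RMSE() and MAE()
# (output not shown):
RMSE(df_predicted_test$predictHousePrice, df_predicted_test$price) # typical error in dollars
MAE(df_predicted_test$predictHousePrice, df_predicted_test$price)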

# a simpler way to see how well we trained the model
library(broom)
(m = tidy(mlrModel)) # tidy() returns the coefficient table as a tibble
## # A tibble: 5 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 20613861. 2472288. 8.34 3.67e- 16
## 2 beds -25456. 4564. -5.58 3.43e- 8
## 3 sqft 148. 5.54 26.7 2.80e-110
## 4 type 17359. 6139. 2.83 4.82e- 3
## 5 longitude 169612. 20365. 8.33 3.94e- 16
m$estimate # all coefficients accessed via a data frame
## [1] 20613861.2245 -25456.3547 147.6437 17359.2603 169612.1812
varImp(mlrModel) # variable importance; for lm this is the absolute value of each coefficient's t-statistic
## Overall
## beds 5.577102
## sqft 26.653248
## type 2.827543
## longitude 8.328741
(a = augment(mlrModel)) # augment() adds fitted values, residuals and diagnostics per row, replacing several manual steps above
## # A tibble: 746 x 13
## .rownames price beds sqft type longitude .fitted .se.fit .resid
## <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 152 510000 5 3508 3 -121. 460917. 7898. 4.91e4
## 2 23 123000 4 1240 3 -121. 159290. 5873. -3.63e4
## 3 223 200000 3 1393 3 -121. 210912. 3123. -1.09e4
## 4 511 360000 2 1119 1 -122. 136844. 11664. 2.23e5
## 5 789 315000 4 2212 3 -121. 316937. 3689. -1.94e3
## 6 852 575000 4 3229 3 -121. 486047. 7137. 8.90e4
## 7 875 95000 3 1058 3 -121. 142551. 4164. -4.76e4
## 8 876 97750 3 1040 3 -121. 144377. 4055. -4.66e4
## 9 657 168750 3 1669 3 -121. 250011. 3179. -8.13e4
## 10 11 100309 3 909 3 -121. 124121. 4504. -2.38e4
## # … with 736 more rows, and 4 more variables: .hat <dbl>, .sigma <dbl>,
## # .cooksd <dbl>, .std.resid <dbl>
head(df_predicted_train) # compare with the fitted values
## city zip beds baths sqft type price latitude longitude id
## 152 10 95757 5 4 3508 3 510000 38.38253 -121.4280 152
## 23 24 95660 4 2 1240 3 123000 38.70279 -121.3822 23
## 223 1 95843 3 2 1393 3 200000 38.70441 -121.3611 223
## 511 34 95816 2 2 1119 1 360000 38.53805 -121.5047 511
## 789 33 95678 4 3 2212 3 315000 38.77587 -121.2989 741
## 852 13 95630 4 3 3229 3 575000 38.70396 -121.1871 804
## predictHousePrice pricediff
## 152 460917 0
## 23 159290 0
## 223 210912 0
## 511 136844 1
## 789 316937 0
## 852 486047 0
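# broom also provides glance() for a one-row, model-level summary (R-squared,
# sigma, AIC, etc.; output not shown):
glance(mlrModel)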
# Logistic regression - use when the outcome is discrete (classification), rather than continuous as in linear regression.
# prep data for ML
library(dplyr)
titanic = read.csv("https://goo.gl/At238b")
str(titanic) # describe values and variable types, notice NA values in age
## 'data.frame': 1309 obs. of 14 variables:
## $ pclass : Factor w/ 1 level "1st": 1 1 1 1 1 1 1 1 1 1 ...
## $ survived : int 1 1 0 0 0 1 1 0 1 0 ...
## $ name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 22 24 25 26 27 31 46 47 51 55 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ age : num 29 0.92 2 30 25 48 63 39 53 71 ...
## $ sibsp : int 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : int 0 2 2 2 2 0 0 0 0 0 ...
## $ ticket : Factor w/ 929 levels "110152","110413",..: 188 50 50 50 50 125 93 16 77 826 ...
## $ fare : num 211 152 152 152 152 ...
## $ cabin : Factor w/ 186 levels "A10","A11","A14",..: 44 80 80 80 80 150 146 16 62 NA ...
## $ embarked : Factor w/ 3 levels "C","Q","S": 3 3 3 3 3 3 3 3 3 1 ...
## $ boat : Factor w/ 27 levels "1","10","11",..: 12 3 NA NA NA 13 2 NA 27 NA ...
## $ body : int NA NA NA 135 NA NA NA NA NA 22 ...
## $ home.dest: Factor w/ 369 levels "?Havana, Cuba",..: 309 231 231 231 231 237 162 24 22 229 ...
nrow(titanic)
## [1] 1309
head(titanic)
## pclass survived name sex
## 1 1st 1 Allen, Miss. Elisabeth Walton female
## 2 1st 1 Allison, Master. Hudson Trevor male
## 3 1st 0 Allison, Miss. Helen Loraine female
## 4 1st 0 Allison, Mr. Hudson Joshua Creighton male
## 5 1st 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female
## 6 1st 1 Anderson, Mr. Harry male
## age sibsp parch ticket fare cabin embarked boat body
## 1 29.00 0 0 24160 211.3375 B5 S 2 NA
## 2 0.92 1 2 113781 151.5500 C22 C26 S 11 NA
## 3 2.00 1 2 113781 151.5500 C22 C26 S <NA> NA
## 4 30.00 1 2 113781 151.5500 C22 C26 S <NA> 135
## 5 25.00 1 2 113781 151.5500 C22 C26 S <NA> NA
## 6 48.00 0 0 19952 26.5500 E12 S 3 NA
## home.dest
## 1 St Louis, MO
## 2 Montreal, PQ / Chesterville, ON
## 3 Montreal, PQ / Chesterville, ON
## 4 Montreal, PQ / Chesterville, ON
## 5 Montreal, PQ / Chesterville, ON
## 6 New York, NY
titanic = titanic %>% filter(age>0) %>% select(-body, -boat, -cabin, -home.dest) # drop rows with missing age and remove variables that are not useful
titanic[!complete.cases(titanic),] # check for rows with NA values
## pclass survived name sex age
## 149 1st 1 Icard, Miss. Amelie female 38.0
## 250 1st 1 Stone, Mrs. George Nelson (Martha Evelyn) female 62.0
## 985 1st 0 Storey, Mr. Thomas male 60.5
## sibsp parch ticket fare embarked
## 149 0 0 113572 80 <NA>
## 250 0 0 113572 80 <NA>
## 985 0 0 3701 NA S
titanic = na.omit(titanic) # remove rows with NA values
titanic[!complete.cases(titanic),] # verify there are no NA values left
## [1] pclass survived name sex age sibsp parch
## [8] ticket fare embarked
## <0 rows> (or 0-length row.names)
# first, explore the data to build intuition
table(titanic$survived) # survival counts (0 = died, 1 = survived)
##
## 0 1
## 618 425
table(titanic$sex, titanic$survived) # survival counts broken down by sex
##
## 0 1
## female 96 290
## male 522 135
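# the same table as row proportions makes the gap clearer: roughly 75% of women
# survived versus roughly 21% of men (output not shown):
prop.table(table(titanic$sex, titanic$survived), margin = 1)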
ggplot(titanic, aes(x=factor(survived),fill=factor(sex))) + geom_bar() # survival counts by sex as a stacked bar chart

# convert sex and embarked to numeric codes
titanic$sex = as.numeric(titanic$sex)
titanic$embarked = as.numeric(titanic$embarked)
# select relevant variables to train
titanic = titanic %>% select(sex, age, sibsp, parch, fare, survived, embarked)
head(titanic) # our data set is ready for ML
## sex age sibsp parch fare survived embarked
## 1 1 29.00 0 0 211.3375 1 3
## 2 2 0.92 1 2 151.5500 1 3
## 3 1 2.00 1 2 151.5500 0 3
## 4 2 30.00 1 2 151.5500 0 3
## 5 1 25.00 1 2 151.5500 0 3
## 6 2 48.00 0 0 26.5500 1 3
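# Side note: glm() handles factor predictors directly by expanding them into dummy
# variables, so the as.numeric() conversions above are a simplification rather than
# a requirement. A sketch, assuming a copy of the data saved before the conversions
# (the titanic_factors name is hypothetical; not run here):
# glm(survived ~ sex + age + fare, data = titanic_factors, family = binomial)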
(n = nrow(titanic))
## [1] 1043
trainIndex = sample(1:n, size = round(0.7*n), replace=FALSE)
train = titanic[trainIndex, ]
test = titanic[-trainIndex, ]
# create the model
lrModel = glm(survived ~ ., data = train, family = binomial)
print(lrModel)
##
## Call: glm(formula = survived ~ ., family = binomial, data = train)
##
## Coefficients:
## (Intercept) sex age sibsp parch
## 4.584117 -2.312028 -0.012818 -0.452349 0.040133
## fare embarked
## 0.009282 -0.384708
##
## Degrees of Freedom: 729 Total (i.e. Null); 723 Residual
## Null Deviance: 987.3
## Residual Deviance: 739 AIC: 753
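# coefficients are on the log-odds scale; exponentiating gives odds ratios, e.g.
# each additional sibling/spouse aboard multiplies the odds of survival by about
# exp(-0.45) ~ 0.64 (output not shown):
exp(coef(lrModel))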
predictSurvived = predict(lrModel, newdata = test, type = "response") # predicted probabilities between 0 and 1
predictSurvived = ifelse(predictSurvived>=0.5,1,0) # threshold at 0.5 to get a hard 0/1 prediction
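# Tip: keep the raw probabilities in their own variable before thresholding; ROC and
# PR curves (see the end of this lesson) need scores, not hard 0/1 labels. A sketch
# (the predictProb name is our own; not run here):
# predictProb = predict(lrModel, newdata = test, type = "response")
# predictSurvived = ifelse(predictProb >= 0.5, 1, 0)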
# compute metrics on the test data predictions with the MLmetrics package
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
AUC(y_pred = predictSurvived, y_true = test$survived)
## [1] 0.7856871
Accuracy(y_pred = predictSurvived, y_true = test$survived)
## [1] 0.798722
Recall(y_pred = predictSurvived, y_true = test$survived)
## [1] 0.8548387
Precision(y_pred = predictSurvived, y_true = test$survived)
## [1] 0.8153846
F1_Score(y_pred = predictSurvived, y_true = test$survived)
## [1] 0.8346457
ConfusionMatrix(y_pred = predictSurvived, y_true = test$survived)
## y_pred
## y_true 0 1
## 0 159 27
## 1 36 91
# or the same confusion matrix, with more statistics, via caret
library(caret)
confusionMatrix(factor(predictSurvived), factor(test$survived))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 159 36
## 1 27 91
##
## Accuracy : 0.7987
## 95% CI : (0.75, 0.8417)
## No Information Rate : 0.5942
## P-Value [Acc > NIR] : 1.047e-14
##
## Kappa : 0.5779
##
## Mcnemar's Test P-Value : 0.3135
##
## Sensitivity : 0.8548
## Specificity : 0.7165
## Pos Pred Value : 0.8154
## Neg Pred Value : 0.7712
## Prevalence : 0.5942
## Detection Rate : 0.5080
## Detection Prevalence : 0.6230
## Balanced Accuracy : 0.7857
##
## 'Positive' Class : 0
##
# ROC and PR curves
library(precrec)
precrec_obj <- evalmod(labels = test$survived, scores = predictSurvived)
autoplot(precrec_obj)
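# Note: because predictSurvived was thresholded to 0/1, the curves above are built
# from hard labels and have a single inflection point. Passing raw probabilities
# gives the usual smooth curves (a sketch, assuming predictProb was kept as
# suggested earlier; not run here):
# precrec_prob <- evalmod(labels = test$survived, scores = predictProb)
# autoplot(precrec_prob)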

# Exercise for home: for the Titanic example, retrain the model after removing the unimportant features.
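# Hint: the summary(lrModel) p-values and caret's varImp() both indicate which
# features contribute least (output not shown):
varImp(lrModel) # variable importance: the absolute value of each coefficient's test statistic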