# LESSON 4
# Recap: multiple linear regression (MLR) exercise on real-world data to predict house prices
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
data(Sacramento)
df = Sacramento
str(df) # look at the variable types
## 'data.frame': 932 obs. of 9 variables:
## $ city : Factor w/ 37 levels "ANTELOPE","AUBURN",..: 34 34 34 34 34 34 34 34 29 31 ...
## $ zip : Factor w/ 68 levels "z95603","z95608",..: 64 52 44 44 53 65 66 49 24 25 ...
## $ beds : int 2 3 2 2 2 3 3 3 2 3 ...
## $ baths : num 1 1 1 1 1 1 2 1 2 2 ...
## $ sqft : int 836 1167 796 852 797 1122 1104 1177 941 1146 ...
## $ type : Factor w/ 3 levels "Condo","Multi_Family",..: 3 3 3 3 3 1 3 3 1 3 ...
## $ price : int 59222 68212 68880 69307 81900 89921 90895 91002 94905 98937 ...
## $ latitude : num 38.6 38.5 38.6 38.6 38.5 ...
## $ longitude: num -121 -121 -121 -121 -121 ...
head(df)
## city zip beds baths sqft type price latitude longitude
## 1 SACRAMENTO z95838 2 1 836 Residential 59222 38.63191 -121.4349
## 2 SACRAMENTO z95823 3 1 1167 Residential 68212 38.47890 -121.4310
## 3 SACRAMENTO z95815 2 1 796 Residential 68880 38.61830 -121.4438
## 4 SACRAMENTO z95815 2 1 852 Residential 69307 38.61684 -121.4391
## 5 SACRAMENTO z95824 2 1 797 Residential 81900 38.51947 -121.4358
## 6 SACRAMENTO z95841 3 1 1122 Condo 89921 38.66260 -121.3278
# convert non-numeric variables to numeric so they can be used in MLR
df$city = as.numeric(df$city)
df$type = as.numeric(df$type)
df$zip = as.numeric(substring(df$zip, 2)) # strip the leading "z"; substring() with no end argument runs to the end of the string (length() would give the vector length, not the string length)
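# Side note: coding factors as raw integers imposes an artificial ordering on cities
# and property types. A common alternative is one-hot (dummy) encoding; a minimal
# sketch using caret's dummyVars() (the dv/dummies names are our own):
dv = dummyVars(~ city + type, data = Sacramento)
dummies = predict(dv, newdata = Sacramento) # one 0/1 column per factor level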
# split the data into training and test sets
df$id = 1:nrow(df) # row id, used later when plotting predictions
n = nrow(df)
trainIndex = sample(1:n, size = round(0.8*n), replace=FALSE)
train = df[trainIndex, ]
test = df[-trainIndex, ]
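# Note: sample() draws a different split on every run. For a reproducible (and
# optionally stratified) split, a sketch using base set.seed() and caret's
# createDataPartition() (the seed value 123 is arbitrary; not run here):
# set.seed(123)
# trainIndex = createDataPartition(df$price, p = 0.8, list = FALSE)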
# now retrain on a reduced set of features chosen during feature engineering
mlrModel = lm(price~beds+sqft+type+longitude,train)
summary(mlrModel) # R-squared should be close to the full model's, meaning the model stays strong with fewer features
##
## Call:
## lm(formula = price ~ beds + sqft + type + longitude, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -260325 -48471 -8877 36302 392465
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.061e+07 2.472e+06 8.338 3.67e-16 ***
## beds -2.546e+04 4.564e+03 -5.577 3.43e-08 ***
## sqft 1.476e+02 5.539e+00 26.653 < 2e-16 ***
## type 1.736e+04 6.139e+03 2.828 0.00482 **
## longitude 1.696e+05 2.036e+04 8.329 3.94e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74890 on 741 degrees of freedom
## Multiple R-squared: 0.6622, Adjusted R-squared: 0.6604
## F-statistic: 363.2 on 4 and 741 DF, p-value: < 2.2e-16
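# 95% confidence intervals for the coefficients (base R's confint(); output not shown):
confint(mlrModel)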
# predict on training data
predictHousePrice = round(predict(mlrModel, newdata = train, type = "response")) # predict house prices on each row of the training data
df_predicted_train = cbind(train,predictHousePrice) # add this as a column to the training data and put in a new df
df_predicted_train$pricediff =
  round((df_predicted_train$price - df_predicted_train$predictHousePrice)/
        df_predicted_train$price) # add a column for the relative difference between actual and predicted price, rounded to the nearest whole number
# plot the actual and predicted values
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df_predicted_plot=df_predicted_train %>% select(id, price, predictHousePrice)
library(googleVis)
## Creating a generic function for 'toJSON' from package 'jsonlite' in package 'googleVis'
##
## Welcome to googleVis version 0.6.4
##
## Please read Google's Terms of Use
## before you start using the package:
## https://developers.google.com/terms/
##
## Note, the plot method of googleVis will by default use
## the standard browser to display its output.
##
## See the googleVis package vignettes for more details,
## or visit https://github.com/mages/googleVis.
##
## To suppress this message use:
## suppressPackageStartupMessages(library(googleVis))
plot(gvisBarChart(df_predicted_plot[1:100,], options=list(height=6000))) # actual vs. predicted prices for the first 100 rows
## starting httpd help server ...
## done
ggplot(df_predicted_train, aes(x=pricediff)) + geom_histogram(bins=4) # histogram of the price differences
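# Because pricediff is rounded to whole numbers, most values collapse to 0 and the
# histogram above is very coarse. A sketch of a finer-grained view (the
# pricediff_pct column is our own addition):
df_predicted_train$pricediff_pct = (df_predicted_train$price -
  df_predicted_train$predictHousePrice) / df_predicted_train$price
ggplot(df_predicted_train, aes(x=pricediff_pct)) + geom_histogram(bins=30) # unrounded relative error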

# predict on test data
predictHousePrice = round(predict(mlrModel, newdata = test, type = "response")) # predict house prices on each row of the test data
df_predicted_test = cbind(test,predictHousePrice) # add this as a column to the test data and put it in a new df
df_predicted_test$pricediff = round((df_predicted_test$price - df_predicted_test$predictHousePrice)/df_predicted_test$price) # add a column for the relative difference between actual and predicted price
# plot the actual and predicted values
library(dplyr)
df_predicted_plot=df_predicted_test %>% select(id, price, predictHousePrice)
library(googleVis)
plot(gvisBarChart(df_predicted_plot[1:100,], options=list(height=6000))) # actual vs. predicted prices for the first 100 rows
ggplot(df_predicted_test, aes(x=pricediff)) + geom_histogram(bins=4) # histogram of the price differences
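# A single-number summary of test-set fit, using caret's RMSE() and MAE()
# (output not shown):
RMSE(df_predicted_test$predictHousePrice, df_predicted_test$price) # typical error in dollars
MAE(df_predicted_test$predictHousePrice, df_predicted_test$price)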

# a simpler way to see how well we trained the model
library(broom)
(m = tidy(mlrModel)) # tidy() returns the coefficient table as a tibble
## # A tibble: 5 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 20613861. 2472288. 8.34 3.67e- 16
## 2 beds -25456. 4564. -5.58 3.43e- 8
## 3 sqft 148. 5.54 26.7 2.80e-110
## 4 type 17359. 6139. 2.83 4.82e- 3
## 5 longitude 169612. 20365. 8.33 3.94e- 16
m$estimate # all coefficients accessed via a data frame
## [1] 20613861.2245 -25456.3547 147.6437 17359.2603 169612.1812
varImp(mlrModel) # variable importance; for lm this is the absolute value of each coefficient's t-statistic
## Overall
## beds 5.577102
## sqft 26.653248
## type 2.827543
## longitude 8.328741
(a = augment(mlrModel)) # augment() adds fitted values, residuals and diagnostics per row, replacing several manual steps above
## # A tibble: 746 x 13
## .rownames price beds sqft type longitude .fitted .se.fit .resid
## <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 152 510000 5 3508 3 -121. 460917. 7898. 4.91e4
## 2 23 123000 4 1240 3 -121. 159290. 5873. -3.63e4
## 3 223 200000 3 1393 3 -121. 210912. 3123. -1.09e4
## 4 511 360000 2 1119 1 -122. 136844. 11664. 2.23e5
## 5 789 315000 4 2212 3 -121. 316937. 3689. -1.94e3
## 6 852 575000 4 3229 3 -121. 486047. 7137. 8.90e4
## 7 875 95000 3 1058 3 -121. 142551. 4164. -4.76e4
## 8 876 97750 3 1040 3 -121. 144377. 4055. -4.66e4
## 9 657 168750 3 1669 3 -121. 250011. 3179. -8.13e4
## 10 11 100309 3 909 3 -121. 124121. 4504. -2.38e4
## # … with 736 more rows, and 4 more variables: .hat <dbl>, .sigma <dbl>,
## # .cooksd <dbl>, .std.resid <dbl>
head(df_predicted_train) # compare with the fitted values
## city zip beds baths sqft type price latitude longitude id
## 152 10 95757 5 4 3508 3 510000 38.38253 -121.4280 152
## 23 24 95660 4 2 1240 3 123000 38.70279 -121.3822 23
## 223 1 95843 3 2 1393 3 200000 38.70441 -121.3611 223
## 511 34 95816 2 2 1119 1 360000 38.53805 -121.5047 511
## 789 33 95678 4 3 2212 3 315000 38.77587 -121.2989 741
## 852 13 95630 4 3 3229 3 575000 38.70396 -121.1871 804
## predictHousePrice pricediff
## 152 460917 0
## 23 159290 0
## 223 210912 0
## 511 136844 1
## 789 316937 0
## 852 486047 0
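# broom also provides glance() for a one-row, model-level summary (R-squared,
# sigma, AIC, etc.; output not shown):
glance(mlrModel)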
# Logistic regression - use when the outcome is discrete (classification), rather than continuous as in linear regression.
# prep data for ML
library(dplyr)
titanic = read.csv("https://goo.gl/At238b")
str(titanic) # describe values and variable types, notice NA values in age
## 'data.frame': 1309 obs. of 14 variables:
## $ pclass : Factor w/ 1 level "1st": 1 1 1 1 1 1 1 1 1 1 ...
## $ survived : int 1 1 0 0 0 1 1 0 1 0 ...
## $ name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 22 24 25 26 27 31 46 47 51 55 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ age : num 29 0.92 2 30 25 48 63 39 53 71 ...
## $ sibsp : int 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : int 0 2 2 2 2 0 0 0 0 0 ...
## $ ticket : Factor w/ 929 levels "110152","110413",..: 188 50 50 50 50 125 93 16 77 826 ...
## $ fare : num 211 152 152 152 152 ...
## $ cabin : Factor w/ 186 levels "A10","A11","A14",..: 44 80 80 80 80 150 146 16 62 NA ...
## $ embarked : Factor w/ 3 levels "C","Q","S": 3 3 3 3 3 3 3 3 3 1 ...
## $ boat : Factor w/ 27 levels "1","10","11",..: 12 3 NA NA NA 13 2 NA 27 NA ...
## $ body : int NA NA NA 135 NA NA NA NA NA 22 ...
## $ home.dest: Factor w/ 369 levels "?Havana, Cuba",..: 309 231 231 231 231 237 162 24 22 229 ...
nrow(titanic)
## [1] 1309
head(titanic)
## pclass survived name sex
## 1 1st 1 Allen, Miss. Elisabeth Walton female
## 2 1st 1 Allison, Master. Hudson Trevor male
## 3 1st 0 Allison, Miss. Helen Loraine female
## 4 1st 0 Allison, Mr. Hudson Joshua Creighton male
## 5 1st 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female
## 6 1st 1 Anderson, Mr. Harry male
## age sibsp parch ticket fare cabin embarked boat body
## 1 29.00 0 0 24160 211.3375 B5 S 2 NA
## 2 0.92 1 2 113781 151.5500 C22 C26 S 11 NA
## 3 2.00 1 2 113781 151.5500 C22 C26 S <NA> NA
## 4 30.00 1 2 113781 151.5500 C22 C26 S <NA> 135
## 5 25.00 1 2 113781 151.5500 C22 C26 S <NA> NA
## 6 48.00 0 0 19952 26.5500 E12 S 3 NA
## home.dest
## 1 St Louis, MO
## 2 Montreal, PQ / Chesterville, ON
## 3 Montreal, PQ / Chesterville, ON
## 4 Montreal, PQ / Chesterville, ON
## 5 Montreal, PQ / Chesterville, ON
## 6 New York, NY
titanic = titanic %>% filter(age>0) %>% select(-body, -boat, -cabin, -home.dest) # drop rows with missing age and remove variables that are not useful
titanic[!complete.cases(titanic),] # check for rows with NA values
## pclass survived name sex age
## 149 1st 1 Icard, Miss. Amelie female 38.0
## 250 1st 1 Stone, Mrs. George Nelson (Martha Evelyn) female 62.0
## 985 1st 0 Storey, Mr. Thomas male 60.5
## sibsp parch ticket fare embarked
## 149 0 0 113572 80 <NA>
## 250 0 0 113572 80 <NA>
## 985 0 0 3701 NA S
titanic = na.omit(titanic) # remove rows with NA values
titanic[!complete.cases(titanic),] # verify there are no NA values left
## [1] pclass survived name sex age sibsp parch
## [8] ticket fare embarked
## <0 rows> (or 0-length row.names)
# first, explore the data to build intuition
table(titanic$survived) # survival counts (0 = died, 1 = survived)
##
## 0 1
## 618 425
table(titanic$sex, titanic$survived) # survival counts broken down by sex
##
## 0 1
## female 96 290
## male 522 135
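# the same table as row proportions makes the gap clearer: roughly 75% of women
# survived versus roughly 21% of men (output not shown):
prop.table(table(titanic$sex, titanic$survived), margin = 1)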
ggplot(titanic, aes(x=factor(survived),fill=factor(sex))) + geom_bar() # survival counts by sex as a stacked bar chart

# convert sex and embarked to numeric codes
titanic$sex = as.numeric(titanic$sex)
titanic$embarked = as.numeric(titanic$embarked)
# select relevant variables to train
titanic = titanic %>% select(sex, age, sibsp, parch, fare, survived, embarked)
head(titanic) # our data set is ready for ML
## sex age sibsp parch fare survived embarked
## 1 1 29.00 0 0 211.3375 1 3
## 2 2 0.92 1 2 151.5500 1 3
## 3 1 2.00 1 2 151.5500 0 3
## 4 2 30.00 1 2 151.5500 0 3
## 5 1 25.00 1 2 151.5500 0 3
## 6 2 48.00 0 0 26.5500 1 3
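# Side note: glm() handles factor predictors directly by expanding them into dummy
# variables, so the as.numeric() conversions above are a simplification rather than
# a requirement. A sketch, assuming a copy of the data saved before the conversions
# (the titanic_factors name is hypothetical; not run here):
# glm(survived ~ sex + age + fare, data = titanic_factors, family = binomial)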
(n = nrow(titanic))
## [1] 1043
trainIndex = sample(1:n, size = round(0.7*n), replace=FALSE)
train = titanic[trainIndex, ]
test = titanic[-trainIndex, ]
# create the model
lrModel = glm(survived ~ ., data = train, family = binomial)
print(lrModel)
##
## Call: glm(formula = survived ~ ., family = binomial, data = train)
##
## Coefficients:
## (Intercept) sex age sibsp parch
## 4.584117 -2.312028 -0.012818 -0.452349 0.040133
## fare embarked
## 0.009282 -0.384708
##
## Degrees of Freedom: 729 Total (i.e. Null); 723 Residual
## Null Deviance: 987.3
## Residual Deviance: 739 AIC: 753
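# coefficients are on the log-odds scale; exponentiating gives odds ratios, e.g.
# each additional sibling/spouse aboard multiplies the odds of survival by about
# exp(-0.45) ~ 0.64 (output not shown):
exp(coef(lrModel))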
predictSurvived = predict(lrModel, newdata = test, type = "response") # predicted probabilities between 0 and 1
predictSurvived = ifelse(predictSurvived>=0.5,1,0) # threshold at 0.5 to get a hard 0/1 prediction
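# Tip: keep the raw probabilities in their own variable before thresholding; ROC and
# PR curves (see the end of this lesson) need scores, not hard 0/1 labels. A sketch
# (the predictProb name is our own; not run here):
# predictProb = predict(lrModel, newdata = test, type = "response")
# predictSurvived = ifelse(predictProb >= 0.5, 1, 0)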
# compute metrics on the test data predictions with the MLmetrics package
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
AUC(y_pred = predictSurvived, y_true = test$survived)
## [1] 0.7856871
Accuracy(y_pred = predictSurvived, y_true = test$survived)
## [1] 0.798722
Recall(y_pred = predictSurvived, y_true = test$survived)
## [1] 0.8548387
Precision(y_pred = predictSurvived, y_true = test$survived)
## [1] 0.8153846
F1_Score(y_pred = predictSurvived, y_true = test$survived)
## [1] 0.8346457
ConfusionMatrix(y_pred = predictSurvived, y_true = test$survived)
## y_pred
## y_true 0 1
## 0 159 27
## 1 36 91
# or the same confusion matrix, with more statistics, via caret
library(caret)
confusionMatrix(factor(predictSurvived), factor(test$survived))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 159 36
## 1 27 91
##
## Accuracy : 0.7987
## 95% CI : (0.75, 0.8417)
## No Information Rate : 0.5942
## P-Value [Acc > NIR] : 1.047e-14
##
## Kappa : 0.5779
##
## Mcnemar's Test P-Value : 0.3135
##
## Sensitivity : 0.8548
## Specificity : 0.7165
## Pos Pred Value : 0.8154
## Neg Pred Value : 0.7712
## Prevalence : 0.5942
## Detection Rate : 0.5080
## Detection Prevalence : 0.6230
## Balanced Accuracy : 0.7857
##
## 'Positive' Class : 0
##
# ROC and PR curves
library(precrec)
precrec_obj <- evalmod(labels = test$survived, scores = predictSurvived)
autoplot(precrec_obj)
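# Note: because predictSurvived was thresholded to 0/1, the curves above are built
# from hard labels and have a single inflection point. Passing raw probabilities
# gives the usual smooth curves (a sketch, assuming predictProb was kept as
# suggested earlier; not run here):
# precrec_prob <- evalmod(labels = test$survived, scores = predictProb)
# autoplot(precrec_prob)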

# Exercise for home: for the Titanic example, retrain the model after removing the unimportant features.
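# Hint: the summary(lrModel) p-values and caret's varImp() both indicate which
# features contribute least (output not shown):
varImp(lrModel) # variable importance: the absolute value of each coefficient's test statistic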