Regression method in R

getwd()
## [1] "C:/Users/edward cooper/Desktop/Learn R"
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.2
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'ggplot2' was built under R version 3.3.2
## Warning: package 'tidyr' was built under R version 3.3.2
## Warning: package 'readr' was built under R version 3.3.2
## Warning: package 'purrr' was built under R version 3.3.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
insurance=read.csv("insurance.csv")

str(insurance)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ region  : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
##  $ charges : num  16885 1726 4449 21984 3867 ...
summary(insurance)
##       age            sex           bmi           children     smoker    
##  Min.   :18.00   female:662   Min.   :15.96   Min.   :0.000   no :1064  
##  1st Qu.:27.00   male  :676   1st Qu.:26.30   1st Qu.:0.000   yes: 274  
##  Median :39.00                Median :30.40   Median :1.000             
##  Mean   :39.21                Mean   :30.66   Mean   :1.095             
##  3rd Qu.:51.00                3rd Qu.:34.69   3rd Qu.:2.000             
##  Max.   :64.00                Max.   :53.13   Max.   :5.000             
##        region       charges     
##  northeast:324   Min.   : 1122  
##  northwest:325   1st Qu.: 4740  
##  southeast:364   Median : 9382  
##  southwest:325   Mean   :13270  
##                  3rd Qu.:16640  
##                  Max.   :63770
hist(insurance$charges)

pairs(insurance)

library(psych)
## Warning: package 'psych' was built under R version 3.3.2
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
pairs.panels(insurance)

create train and test data using caret package as always

library(caret)
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
index=createDataPartition(insurance$charges,p=0.8,list=FALSE)

train_insurance=insurance[index,]
test_insurance=insurance[-index,]

apply lm method on training data

insurance_lm=lm(charges~.,data=insurance)

summary(insurance_lm)
## 
## Call:
## lm(formula = charges ~ ., data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11304.9  -2848.1   -982.1   1393.9  29992.8 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11938.5      987.8 -12.086  < 2e-16 ***
## age                256.9       11.9  21.587  < 2e-16 ***
## sexmale           -131.3      332.9  -0.394 0.693348    
## bmi                339.2       28.6  11.860  < 2e-16 ***
## children           475.5      137.8   3.451 0.000577 ***
## smokeryes        23848.5      413.1  57.723  < 2e-16 ***
## regionnorthwest   -353.0      476.3  -0.741 0.458769    
## regionsoutheast  -1035.0      478.7  -2.162 0.030782 *  
## regionsouthwest   -960.0      477.9  -2.009 0.044765 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared:  0.7509, Adjusted R-squared:  0.7494 
## F-statistic: 500.8 on 8 and 1329 DF,  p-value: < 2.2e-16
insurance_lm_pred=predict(insurance_lm,test_insurance)

errors_lm=mean((insurance_lm_pred-test_insurance$charges)^2)%>%sqrt


errors_lm
## [1] 5941.475
plot(insurance_lm)

## refit model with relevent predictors (age,bmi,children,smoker)

insurance_lm_refit=lm(charges~age+bmi+children+smoker,data=insurance)

summary(insurance_lm_refit)
## 
## Call:
## lm(formula = charges ~ age + bmi + children + smoker, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11897.9  -2920.8   -986.6   1392.2  29509.6 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12102.77     941.98 -12.848  < 2e-16 ***
## age            257.85      11.90  21.675  < 2e-16 ***
## bmi            321.85      27.38  11.756  < 2e-16 ***
## children       473.50     137.79   3.436 0.000608 ***
## smokeryes    23811.40     411.22  57.904  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6068 on 1333 degrees of freedom
## Multiple R-squared:  0.7497, Adjusted R-squared:  0.7489 
## F-statistic: 998.1 on 4 and 1333 DF,  p-value: < 2.2e-16
insurance_lm_refit_pred=predict(insurance_lm_refit,test_insurance)


errors_lm_refit=(insurance_lm_refit_pred-test_insurance$charges)^2%>%mean%>%sqrt

errors_lm_refit
## [1] 5914.953
plot(insurance_lm_refit)

## it seems that two mdoels does not differ much.

add some non-linear term and interaction

library(dplyr)

insure_more=insurance%>%mutate(bmi30=ifelse(bmi>30,"overweight","normal")%>%factor,child2=ifelse(children>2,"MKids","LKids")%>%factor)

split training and testing data

library(caret)
index=createDataPartition(insure_more$charges,p=0.8,list=FALSE)

train_insurance_more=insure_more[index,]
test_insurance_more=insure_more[-index,]

apply lm method

insure_more_lm=lm(charges~age+age^2+bmi+children+smoker+bmi30+child2,data=train_insurance_more)

summary(insure_more_lm)
## 
## Call:
## lm(formula = charges ~ age + age^2 + bmi + children + smoker + 
##     bmi30 + child2, data = train_insurance_more)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12639.9  -3547.2   -161.6   1615.0  27951.3 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -6841.14    1361.75  -5.024 5.94e-07 ***
## age               259.24      13.10  19.786  < 2e-16 ***
## bmi                84.30      49.28   1.711  0.08740 .  
## children          683.17     240.15   2.845  0.00453 ** 
## smokeryes       23823.35     446.19  53.393  < 2e-16 ***
## bmi30overweight  3433.85     603.61   5.689 1.65e-08 ***
## child2MKids     -1164.96     802.79  -1.451  0.14704    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5957 on 1065 degrees of freedom
## Multiple R-squared:  0.7629, Adjusted R-squared:  0.7616 
## F-statistic: 571.2 on 6 and 1065 DF,  p-value: < 2.2e-16
insure_more_pred=predict(insure_more_lm,test_insurance_more)


error_insure_more=(insure_more_pred-test_insurance_more$charges)^2%>%mean%>%sqrt

error_insure_more
## [1] 6180.675

it performed a little bit better with similar adjusted R-squared

plotting the result shows that the residual is not normally distributed thus there are definitely some features not captured by the regression method

it seems that we need to separate data into two groups for further regression analysis.

plot(insure_more_lm)

Regression tree come to the rescue.

read in wine data

library(tidyverse)
library(rpart) 
## Warning: package 'rpart' was built under R version 3.3.2
wine=read.csv("whitewines.csv")

exploring dataset

str(wine)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...
summary(wine)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200  
##  Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900  
##  Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.00900   Min.   :  2.00      Min.   :  9.0       
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0       
##  Median :0.04300   Median : 34.00      Median :134.0       
##  Mean   :0.04577   Mean   : 35.31      Mean   :138.4       
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0       
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0       
##     density             pH          sulphates         alcohol     
##  Min.   :0.9871   Min.   :2.720   Min.   :0.2200   Min.   : 8.00  
##  1st Qu.:0.9917   1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50  
##  Median :0.9937   Median :3.180   Median :0.4700   Median :10.40  
##  Mean   :0.9940   Mean   :3.188   Mean   :0.4898   Mean   :10.51  
##  3rd Qu.:0.9961   3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40  
##  Max.   :1.0390   Max.   :3.820   Max.   :1.0800   Max.   :14.20  
##     quality     
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.878  
##  3rd Qu.:6.000  
##  Max.   :9.000
hist(wine$quality)

## more pairing plot

library(psych)

pairs.panels(wine)

split data into test and train dataset

library(caret)
index=createDataPartition(wine$quality,p=0.8,list=FALSE)

train_wine=wine[index,]

test_wine=wine[-index,]

apply the rpart package

wine_tree=rpart(quality~.,data=train_wine)

wine_tree_pred=predict(wine_tree, test_wine)

library(gmodels)
## Warning: package 'gmodels' was built under R version 3.3.2
CrossTable(wine_tree_pred, test_wine$quality)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  979 
## 
##  
##                  | test_wine$quality 
##   wine_tree_pred |         3 |         4 |         5 |         6 |         7 |         8 | Row Total | 
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 4.90298507462687 |         0 |         5 |        17 |         4 |         1 |         0 |        27 | 
##                  |     0.138 |    23.147 |     9.658 |     5.429 |     3.192 |     0.855 |           | 
##                  |     0.000 |     0.185 |     0.630 |     0.148 |     0.037 |     0.000 |     0.028 | 
##                  |     0.000 |     0.179 |     0.058 |     0.009 |     0.006 |     0.000 |           | 
##                  |     0.000 |     0.005 |     0.017 |     0.004 |     0.001 |     0.000 |           | 
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##            5.375 |         1 |         6 |        11 |        15 |         2 |         2 |        37 | 
##                  |     3.481 |    23.078 |     0.002 |     0.153 |     3.425 |     0.586 |           | 
##                  |     0.027 |     0.162 |     0.297 |     0.405 |     0.054 |     0.054 |     0.038 | 
##                  |     0.200 |     0.214 |     0.037 |     0.034 |     0.011 |     0.065 |           | 
##                  |     0.001 |     0.006 |     0.011 |     0.015 |     0.002 |     0.002 |           | 
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 5.41672946495855 |         1 |         8 |       165 |       135 |        14 |         2 |       325 | 
##                  |     0.262 |     0.180 |    45.932 |     0.791 |    35.349 |     6.680 |           | 
##                  |     0.003 |     0.025 |     0.508 |     0.415 |     0.043 |     0.006 |     0.332 | 
##                  |     0.200 |     0.286 |     0.559 |     0.308 |     0.077 |     0.065 |           | 
##                  |     0.001 |     0.008 |     0.169 |     0.138 |     0.014 |     0.002 |           | 
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 5.92242424242424 |         1 |         4 |        53 |       117 |        38 |         2 |       215 | 
##                  |     0.009 |     0.751 |     2.144 |     4.398 |     0.077 |     3.396 |           | 
##                  |     0.005 |     0.019 |     0.247 |     0.544 |     0.177 |     0.009 |     0.220 | 
##                  |     0.200 |     0.143 |     0.180 |     0.267 |     0.210 |     0.065 |           | 
##                  |     0.001 |     0.004 |     0.054 |     0.120 |     0.039 |     0.002 |           | 
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 6.18202502844141 |         2 |         4 |        42 |        98 |        62 |         8 |       216 | 
##                  |     0.729 |     0.768 |     8.189 |     0.013 |    12.192 |     0.197 |           | 
##                  |     0.009 |     0.019 |     0.194 |     0.454 |     0.287 |     0.037 |     0.221 | 
##                  |     0.400 |     0.143 |     0.142 |     0.223 |     0.343 |     0.258 |           | 
##                  |     0.002 |     0.004 |     0.043 |     0.100 |     0.063 |     0.008 |           | 
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 6.62310030395137 |         0 |         1 |         7 |        70 |        64 |        17 |       159 | 
##                  |     0.812 |     2.767 |    34.934 |     0.024 |    40.733 |    28.436 |           | 
##                  |     0.000 |     0.006 |     0.044 |     0.440 |     0.403 |     0.107 |     0.162 | 
##                  |     0.000 |     0.036 |     0.024 |     0.159 |     0.354 |     0.548 |           | 
##                  |     0.000 |     0.001 |     0.007 |     0.072 |     0.065 |     0.017 |           | 
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##     Column Total |         5 |        28 |       295 |       439 |       181 |        31 |       979 | 
##                  |     0.005 |     0.029 |     0.301 |     0.448 |     0.185 |     0.032 |           | 
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 
## 

visualizing the tree

library(rpart.plot)

rpart.plot(wine_tree)

ggplot for predicted and test values of wine quality

library(ggplot2)
library(ggExtra)
## Warning: package 'ggExtra' was built under R version 3.3.2
library(dplyr)
p1=cbind(predict=wine_tree_pred,test=test_wine$quality)%>%as.data.frame%>%mutate(diff=abs(test-predict)                                                                     )%>%ggplot()+geom_point(aes(x=predict,y=test,color=diff))

ggMarginal(p1)

## measure performance with the mean absolute error (MAE)

(wine_tree_pred-test_wine$quality)%>%abs%>%mean
## [1] 0.6133847

improve performance by building a model tree

There is some error with the M5P fucntion on windows 10

Sys.getenv("WEKA_HOME")
## [1] "C:/Users/edward cooper/Desktop/Learn R"
Sys.setenv(WEKA_HOME="C:/Users/edward cooper/Desktop/Learn R")
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.3.2
WPM("list-packages", "installed")
## Installed    Repository  Loaded  Package
## =========    ==========  ======  =======
## 
library(RWeka)
m5p=M5P(quality~.,data=train_wine)


summary(m5p)
## 
## === Summary ===
## 
## Correlation coefficient                  0.6466
## Mean absolute error                      0.5275
## Root mean squared error                  0.6786
## Relative absolute error                 78.4201 %
## Root relative squared error             76.3009 %
## Total Number of Instances             3919
m5p_pred=predict(m5p,test_wine)

(m5p_pred-test_wine$quality)%>%abs%>%mean
## [1] 0.5777675