getwd()
## [1] "C:/Users/edward cooper/Desktop/Learn R"
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.3.2
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'ggplot2' was built under R version 3.3.2
## Warning: package 'tidyr' was built under R version 3.3.2
## Warning: package 'readr' was built under R version 3.3.2
## Warning: package 'purrr' was built under R version 3.3.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
insurance=read.csv("insurance.csv")
str(insurance)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
summary(insurance)
## age sex bmi children smoker
## Min. :18.00 female:662 Min. :15.96 Min. :0.000 no :1064
## 1st Qu.:27.00 male :676 1st Qu.:26.30 1st Qu.:0.000 yes: 274
## Median :39.00 Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## region charges
## northeast:324 Min. : 1122
## northwest:325 1st Qu.: 4740
## southeast:364 Median : 9382
## southwest:325 Mean :13270
## 3rd Qu.:16640
## Max. :63770
hist(insurance$charges)
pairs(insurance)
library(psych)
## Warning: package 'psych' was built under R version 3.3.2
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
pairs.panels(insurance)
library(caret)
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
index=createDataPartition(insurance$charges,p=0.8,list=FALSE)
train_insurance=insurance[index,]
test_insurance=insurance[-index,]
insurance_lm=lm(charges~.,data=insurance)
summary(insurance_lm)
##
## Call:
## lm(formula = charges ~ ., data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11304.9 -2848.1 -982.1 1393.9 29992.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11938.5 987.8 -12.086 < 2e-16 ***
## age 256.9 11.9 21.587 < 2e-16 ***
## sexmale -131.3 332.9 -0.394 0.693348
## bmi 339.2 28.6 11.860 < 2e-16 ***
## children 475.5 137.8 3.451 0.000577 ***
## smokeryes 23848.5 413.1 57.723 < 2e-16 ***
## regionnorthwest -353.0 476.3 -0.741 0.458769
## regionsoutheast -1035.0 478.7 -2.162 0.030782 *
## regionsouthwest -960.0 477.9 -2.009 0.044765 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7494
## F-statistic: 500.8 on 8 and 1329 DF, p-value: < 2.2e-16
insurance_lm_pred=predict(insurance_lm,test_insurance)
errors_lm=mean((insurance_lm_pred-test_insurance$charges)^2)%>%sqrt
errors_lm
## [1] 5941.475
plot(insurance_lm)
## refit model with relevent predictors (age,bmi,children,smoker)
insurance_lm_refit=lm(charges~age+bmi+children+smoker,data=insurance)
summary(insurance_lm_refit)
##
## Call:
## lm(formula = charges ~ age + bmi + children + smoker, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11897.9 -2920.8 -986.6 1392.2 29509.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12102.77 941.98 -12.848 < 2e-16 ***
## age 257.85 11.90 21.675 < 2e-16 ***
## bmi 321.85 27.38 11.756 < 2e-16 ***
## children 473.50 137.79 3.436 0.000608 ***
## smokeryes 23811.40 411.22 57.904 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6068 on 1333 degrees of freedom
## Multiple R-squared: 0.7497, Adjusted R-squared: 0.7489
## F-statistic: 998.1 on 4 and 1333 DF, p-value: < 2.2e-16
insurance_lm_refit_pred=predict(insurance_lm_refit,test_insurance)
errors_lm_refit=(insurance_lm_refit_pred-test_insurance$charges)^2%>%mean%>%sqrt
errors_lm_refit
## [1] 5914.953
plot(insurance_lm_refit)
## it seems that two mdoels does not differ much.
library(dplyr)
insure_more=insurance%>%mutate(bmi30=ifelse(bmi>30,"overweight","normal")%>%factor,child2=ifelse(children>2,"MKids","LKids")%>%factor)
library(caret)
index=createDataPartition(insure_more$charges,p=0.8,list=FALSE)
train_insurance_more=insure_more[index,]
test_insurance_more=insure_more[-index,]
insure_more_lm=lm(charges~age+age^2+bmi+children+smoker+bmi30+child2,data=train_insurance_more)
summary(insure_more_lm)
##
## Call:
## lm(formula = charges ~ age + age^2 + bmi + children + smoker +
## bmi30 + child2, data = train_insurance_more)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12639.9 -3547.2 -161.6 1615.0 27951.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6841.14 1361.75 -5.024 5.94e-07 ***
## age 259.24 13.10 19.786 < 2e-16 ***
## bmi 84.30 49.28 1.711 0.08740 .
## children 683.17 240.15 2.845 0.00453 **
## smokeryes 23823.35 446.19 53.393 < 2e-16 ***
## bmi30overweight 3433.85 603.61 5.689 1.65e-08 ***
## child2MKids -1164.96 802.79 -1.451 0.14704
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5957 on 1065 degrees of freedom
## Multiple R-squared: 0.7629, Adjusted R-squared: 0.7616
## F-statistic: 571.2 on 6 and 1065 DF, p-value: < 2.2e-16
insure_more_pred=predict(insure_more_lm,test_insurance_more)
error_insure_more=(insure_more_pred-test_insurance_more$charges)^2%>%mean%>%sqrt
error_insure_more
## [1] 6180.675
plot(insure_more_lm)
library(tidyverse)
library(rpart)
## Warning: package 'rpart' was built under R version 3.3.2
wine=read.csv("whitewines.csv")
str(wine)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
summary(wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00900 Min. : 2.00 Min. : 9.0
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0
## Median :0.04300 Median : 34.00 Median :134.0
## Mean :0.04577 Mean : 35.31 Mean :138.4
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0
## Max. :0.34600 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.4700 Median :10.40
## Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40
## Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.878
## 3rd Qu.:6.000
## Max. :9.000
hist(wine$quality)
## more pairing plot
library(psych)
pairs.panels(wine)
library(caret)
index=createDataPartition(wine$quality,p=0.8,list=FALSE)
train_wine=wine[index,]
test_wine=wine[-index,]
wine_tree=rpart(quality~.,data=train_wine)
wine_tree_pred=predict(wine_tree, test_wine)
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.3.2
CrossTable(wine_tree_pred, test_wine$quality)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 979
##
##
## | test_wine$quality
## wine_tree_pred | 3 | 4 | 5 | 6 | 7 | 8 | Row Total |
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 4.90298507462687 | 0 | 5 | 17 | 4 | 1 | 0 | 27 |
## | 0.138 | 23.147 | 9.658 | 5.429 | 3.192 | 0.855 | |
## | 0.000 | 0.185 | 0.630 | 0.148 | 0.037 | 0.000 | 0.028 |
## | 0.000 | 0.179 | 0.058 | 0.009 | 0.006 | 0.000 | |
## | 0.000 | 0.005 | 0.017 | 0.004 | 0.001 | 0.000 | |
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 5.375 | 1 | 6 | 11 | 15 | 2 | 2 | 37 |
## | 3.481 | 23.078 | 0.002 | 0.153 | 3.425 | 0.586 | |
## | 0.027 | 0.162 | 0.297 | 0.405 | 0.054 | 0.054 | 0.038 |
## | 0.200 | 0.214 | 0.037 | 0.034 | 0.011 | 0.065 | |
## | 0.001 | 0.006 | 0.011 | 0.015 | 0.002 | 0.002 | |
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 5.41672946495855 | 1 | 8 | 165 | 135 | 14 | 2 | 325 |
## | 0.262 | 0.180 | 45.932 | 0.791 | 35.349 | 6.680 | |
## | 0.003 | 0.025 | 0.508 | 0.415 | 0.043 | 0.006 | 0.332 |
## | 0.200 | 0.286 | 0.559 | 0.308 | 0.077 | 0.065 | |
## | 0.001 | 0.008 | 0.169 | 0.138 | 0.014 | 0.002 | |
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 5.92242424242424 | 1 | 4 | 53 | 117 | 38 | 2 | 215 |
## | 0.009 | 0.751 | 2.144 | 4.398 | 0.077 | 3.396 | |
## | 0.005 | 0.019 | 0.247 | 0.544 | 0.177 | 0.009 | 0.220 |
## | 0.200 | 0.143 | 0.180 | 0.267 | 0.210 | 0.065 | |
## | 0.001 | 0.004 | 0.054 | 0.120 | 0.039 | 0.002 | |
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 6.18202502844141 | 2 | 4 | 42 | 98 | 62 | 8 | 216 |
## | 0.729 | 0.768 | 8.189 | 0.013 | 12.192 | 0.197 | |
## | 0.009 | 0.019 | 0.194 | 0.454 | 0.287 | 0.037 | 0.221 |
## | 0.400 | 0.143 | 0.142 | 0.223 | 0.343 | 0.258 | |
## | 0.002 | 0.004 | 0.043 | 0.100 | 0.063 | 0.008 | |
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 6.62310030395137 | 0 | 1 | 7 | 70 | 64 | 17 | 159 |
## | 0.812 | 2.767 | 34.934 | 0.024 | 40.733 | 28.436 | |
## | 0.000 | 0.006 | 0.044 | 0.440 | 0.403 | 0.107 | 0.162 |
## | 0.000 | 0.036 | 0.024 | 0.159 | 0.354 | 0.548 | |
## | 0.000 | 0.001 | 0.007 | 0.072 | 0.065 | 0.017 | |
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 5 | 28 | 295 | 439 | 181 | 31 | 979 |
## | 0.005 | 0.029 | 0.301 | 0.448 | 0.185 | 0.032 | |
## -----------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
library(rpart.plot)
rpart.plot(wine_tree)
library(ggplot2)
library(ggExtra)
## Warning: package 'ggExtra' was built under R version 3.3.2
library(dplyr)
p1=cbind(predict=wine_tree_pred,test=test_wine$quality)%>%as.data.frame%>%mutate(diff=abs(test-predict) )%>%ggplot()+geom_point(aes(x=predict,y=test,color=diff))
ggMarginal(p1)
## measure performance with the mean absolute error (MAE)
(wine_tree_pred-test_wine$quality)%>%abs%>%mean
## [1] 0.6133847
There is some error with the M5P fucntion on windows 10
Sys.getenv("WEKA_HOME")
## [1] "C:/Users/edward cooper/Desktop/Learn R"
Sys.setenv(WEKA_HOME="C:/Users/edward cooper/Desktop/Learn R")
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.3.2
WPM("list-packages", "installed")
## Installed Repository Loaded Package
## ========= ========== ====== =======
##
library(RWeka)
m5p=M5P(quality~.,data=train_wine)
summary(m5p)
##
## === Summary ===
##
## Correlation coefficient 0.6466
## Mean absolute error 0.5275
## Root mean squared error 0.6786
## Relative absolute error 78.4201 %
## Root relative squared error 76.3009 %
## Total Number of Instances 3919
m5p_pred=predict(m5p,test_wine)
(m5p_pred-test_wine$quality)%>%abs%>%mean
## [1] 0.5777675