# Read the housing data with read.table() defaults, attach the 14 column
# names used throughout this script (MDEV is the modelled response), and
# print per-column summary statistics.
housing <- read.table(file = "housing.data")
names(housing) <- c(
  "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
  "DIS", "RAD", "TAX", "PRATIO", "B", "LSTAT", "MDEV"
)
summary(housing)
## CRIM ZN INDUS CHAS
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## NOX RM AGE DIS
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## RAD TAX PRATIO B
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## LSTAT MDEV
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Pairwise plot of every column against every other
plot(housing)

# Correlation plot: the correlation matrix drawn as numbers
library(corrplot)
corrplot(cor(housing), method = "number", tl.cex = 0.5)

# Partitioning: sort rows by ascending MDEV before splitting
housing <- housing[order(housing[["MDEV"]]), ]
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2

# Fixed seed so the 75/25 split on MDEV is reproducible
set.seed(3277)
trainingIndices <- createDataPartition(
  housing$MDEV,
  p = 0.75,
  list = FALSE
)
# Rows selected by the partition train the models; the rest are held out
housingTraining <- housing[trainingIndices, ]
housingTesting <- housing[-trainingIndices, ]
nrow(housingTraining)
## [1] 381
nrow(housingTesting)
## [1] 125
# Linear model: ordinary least squares of MDEV on all 13 other columns,
# fitted on the training rows only
linearModel <- lm(
  MDEV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
    DIS + RAD + TAX + PRATIO + B + LSTAT,
  data = housingTraining
)
summary(linearModel)
##
## Call:
## lm(formula = MDEV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
## DIS + RAD + TAX + PRATIO + B + LSTAT, data = housingTraining)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.1317 -2.6258 -0.5413 1.5656 26.2551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.196069 5.609316 7.344 1.35e-12 ***
## CRIM -0.122053 0.032598 -3.744 0.000210 ***
## ZN 0.052261 0.015412 3.391 0.000772 ***
## INDUS 0.032047 0.068200 0.470 0.638709
## CHAS 2.385849 0.959308 2.487 0.013324 *
## NOX -17.566444 4.273389 -4.111 4.87e-05 ***
## RM 3.485134 0.463397 7.521 4.23e-13 ***
## AGE -0.003562 0.014443 -0.247 0.805317
## DIS -1.545347 0.221048 -6.991 1.30e-11 ***
## RAD 0.333380 0.076002 4.386 1.51e-05 ***
## TAX -0.014973 0.004317 -3.468 0.000586 ***
## PRATIO -0.995370 0.145592 -6.837 3.39e-11 ***
## B 0.006718 0.002832 2.373 0.018180 *
## LSTAT -0.521544 0.054005 -9.657 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.549 on 367 degrees of freedom
## Multiple R-squared: 0.7605, Adjusted R-squared: 0.752
## F-statistic: 89.63 on 13 and 367 DF, p-value: < 2.2e-16
# Prediction: apply the linear model to the held-out rows, then compare
# the distribution of predictions with the observed MDEV values
predicted <- predict(linearModel, newdata = housingTesting)
summary(predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8783 17.8442 21.0676 22.4296 27.2561 42.8923
summary(housingTesting[["MDEV"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.10 17.10 21.20 22.89 25.00 50.00
# Predicted (x axis) against observed (y axis)
plot(predicted, housingTesting$MDEV)
# Helper: sum of the squared elements of x. Used below on prediction
# errors to get a residual sum of squares (it does not itself compute R^2).
sumofsquares <- function(x) {
  squared <- x^2
  sum(squared)
}
# Residual sum of squares of the linear model on the test rows
diff <- predicted - housingTesting[["MDEV"]]
sumofsquares(x = diff)
## [1] 3555.882
# Generalized linear model. No `family` is given, so glm() uses its
# gaussian default (see the dispersion line in the summary); the
# coefficients therefore match the lm() fit above. This is not a logistic
# regression.
lr <- glm(
  MDEV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
    DIS + RAD + TAX + PRATIO + B + LSTAT,
  data = housingTraining
)
summary(lr)
##
## Call:
## glm(formula = MDEV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
## DIS + RAD + TAX + PRATIO + B + LSTAT, data = housingTraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -14.1317 -2.6258 -0.5413 1.5656 26.2551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.196069 5.609316 7.344 1.35e-12 ***
## CRIM -0.122053 0.032598 -3.744 0.000210 ***
## ZN 0.052261 0.015412 3.391 0.000772 ***
## INDUS 0.032047 0.068200 0.470 0.638709
## CHAS 2.385849 0.959308 2.487 0.013324 *
## NOX -17.566444 4.273389 -4.111 4.87e-05 ***
## RM 3.485134 0.463397 7.521 4.23e-13 ***
## AGE -0.003562 0.014443 -0.247 0.805317
## DIS -1.545347 0.221048 -6.991 1.30e-11 ***
## RAD 0.333380 0.076002 4.386 1.51e-05 ***
## TAX -0.014973 0.004317 -3.468 0.000586 ***
## PRATIO -0.995370 0.145592 -6.837 3.39e-11 ***
## B 0.006718 0.002832 2.373 0.018180 *
## LSTAT -0.521544 0.054005 -9.657 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 20.69378)
##
## Null deviance: 31707.3 on 380 degrees of freedom
## Residual deviance: 7594.6 on 367 degrees of freedom
## AIC: 2251.3
##
## Number of Fisher Scoring iterations: 2
# Test-set predictions from the glm fit, summarised and plotted against
# the observed MDEV values
predicted <- predict(lr, newdata = housingTesting)
summary(predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8783 17.8442 21.0676 22.4296 27.2561 42.8923
plot(predicted, housingTesting$MDEV)

# Residual sum of squares of the glm fit on the test rows
diff <- predicted - housingTesting[["MDEV"]]
sumofsquares(x = diff)
## [1] 3555.882
# Residual plot: training residuals of the lm fit, in row order
plot(resid(linearModel))

# Least-squares line of predicted (Y) on observed (x) test values,
# computed by hand: slope = Sxy / Sxx, intercept from the two means
x <- housingTesting$MDEV
Y <- predicted
b1 <- sum((x - mean(x)) * (Y - mean(Y))) /
  sum((x - mean(x))^2)
b0 <- mean(Y) - b1 * mean(x)
c(b0, b1)
## [1] 7.2106245 0.6648381
# Scatter of observed vs predicted with the fitted line overlaid
plot(x, Y)
abline(a = b0, b = b1, col = "blue", lwd = 2)

#relative importance
library(relaimpo)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked _by_ '.GlobalEnv':
##
## housing
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
## The following object is masked from 'package:caret':
##
## cluster
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
# Relative importance of each regressor under four metrics; rela = TRUE
# normalises each metric to sum to 100%
calc.relimp(
  linearModel,
  type = c("lmg", "last", "first", "pratt"),
  rela = TRUE
)
## Warning in rev(variances[[p]]) - variances[[p + 1]]: Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
## Response variable: MDEV
## Total response variance: 83.44019
## Analysis based on 381 observations
##
## 13 Regressors:
## CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PRATIO B LSTAT
## Proportion of variance explained by model: 76.05%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg last first pratt
## CRIM 0.04378500 0.0423236380 0.05959783 0.069549551
## ZN 0.04085431 0.0347151937 0.05480466 0.072324623
## INDUS 0.04927578 0.0006666234 0.08442062 -0.015510766
## CHAS 0.02068028 0.0186745066 0.01195166 0.016098916
## NOX 0.04611049 0.0510155167 0.06866322 0.129797308
## RM 0.23110043 0.1707701764 0.16468562 0.239015600
## AGE 0.03211959 0.0001836714 0.05639641 0.005826449
## DIS 0.04282755 0.1475559786 0.02469774 -0.125578499
## RAD 0.03552896 0.0580913573 0.05929346 -0.172215184
## TAX 0.05313897 0.0363198971 0.08310082 0.175938820
## PRATIO 0.11235443 0.1411152591 0.09803364 0.165972509
## B 0.02614223 0.0169947393 0.03917392 0.031322939
## LSTAT 0.26608199 0.2815734421 0.19518041 0.407457734
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## CRIM -0.39658057 -0.27179045 -0.21108113 -0.17716944 -0.15605272
## ZN 0.15016161 0.10008617 0.07724633 0.06573547 0.05920013
## INDUS -0.66137913 -0.49760611 -0.38326657 -0.29972518 -0.23603446
## CHAS 6.71617551 6.28502633 5.84865357 5.37351604 4.90522742
## NOX -35.23627433 -24.37290112 -18.14707801 -14.72054067 -13.02167728
## RM 9.10534876 7.97900568 7.33195455 6.85127499 6.43337784
## AGE -0.13074649 -0.08136606 -0.05484617 -0.03952753 -0.03003803
## DIS 1.15243247 0.14262752 -0.45525720 -0.82823438 -1.06957328
## RAD -0.43523357 -0.23718077 -0.11226188 -0.02648010 0.03740961
## TAX -0.02681594 -0.02111878 -0.01748128 -0.01513793 -0.01363214
## PRATIO -2.22931346 -1.79620241 -1.57371014 -1.43633047 -1.33810121
## B 0.03185870 0.02040032 0.01517751 0.01236138 0.01063506
## LSTAT -0.94731052 -0.89595398 -0.85129784 -0.81015368 -0.77115301
## 6Xs 7Xs 8Xs 9Xs
## CRIM -0.142114473 -0.132753351 -0.126571106 -0.122731539
## ZN 0.055115133 0.052452316 0.050767854 0.049851838
## INDUS -0.185438964 -0.143646311 -0.107867355 -0.076227096
## CHAS 4.467781227 4.070243661 3.713818215 3.395429620
## NOX -12.395682217 -12.433747908 -12.880656870 -13.579740358
## RM 6.041597852 5.661331637 5.286757133 4.916192653
## AGE -0.023741516 -0.019260870 -0.015833831 -0.013014735
## DIS -1.229160682 -1.335838738 -1.407531199 -1.456002582
## RAD 0.088325237 0.131164783 0.168894108 0.203489063
## TAX -0.012698631 -0.012186554 -0.012010689 -0.012121634
## PRATIO -1.261487462 -1.199051858 -1.147196205 -1.103918452
## B 0.009490813 0.008693917 0.008119381 0.007693426
## LSTAT -0.733817032 -0.698052018 -0.663912120 -0.631502580
## 10Xs 11Xs 12Xs 13Xs
## CRIM -0.120692182 -0.120074946 -0.120600039 -0.122052773
## ZN 0.049593499 0.049928963 0.050822551 0.052261271
## INDUS -0.047386409 -0.020303414 0.005907552 0.032047013
## CHAS 3.109771129 2.850585116 2.611402361 2.385848772
## NOX -14.437938014 -15.403070024 -16.449059540 -17.566443917
## RM 4.549912278 4.188852718 3.833836628 3.485134376
## AGE -0.010529199 -0.008200006 -0.005906958 -0.003562336
## DIS -1.489241258 -1.512781857 -1.530531776 -1.545346737
## RAD 0.236344006 0.268469491 0.300618320 0.333379915
## TAX -0.012489018 -0.013093345 -0.013922910 -0.014972759
## PRATIO -1.067936637 -1.038321954 -1.014337069 -0.995369987
## B 0.007368736 0.007112789 0.006901908 0.006718093
## LSTAT -0.600942864 -0.572352417 -0.545846642 -0.521544146
# Stepwise regression: AIC-guided search starting from the full linear
# model, allowed to both drop and re-add terms
library(MASS)
step <- stepAIC(object = linearModel, direction = "both")
## Start: AIC=1168.1
## MDEV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + DIS + RAD +
## TAX + PRATIO + B + LSTAT
##
## Df Sum of Sq RSS AIC
## - AGE 1 1.26 7595.9 1166.2
## - INDUS 1 4.57 7599.2 1166.3
## <none> 7594.6 1168.1
## - B 1 116.49 7711.1 1171.9
## - CHAS 1 128.00 7722.6 1172.5
## - ZN 1 237.95 7832.6 1177.9
## - TAX 1 248.95 7843.6 1178.4
## - CRIM 1 290.10 7884.7 1180.4
## - NOX 1 349.67 7944.3 1183.2
## - RAD 1 398.17 7992.8 1185.6
## - PRATIO 1 967.24 8561.9 1211.8
## - DIS 1 1011.39 8606.0 1213.7
## - RM 1 1170.50 8765.1 1220.7
## - LSTAT 1 1929.98 9524.6 1252.4
##
## Step: AIC=1166.17
## MDEV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + DIS + RAD + TAX +
## PRATIO + B + LSTAT
##
## Df Sum of Sq RSS AIC
## - INDUS 1 4.53 7600.4 1164.4
## <none> 7595.9 1166.2
## + AGE 1 1.26 7594.6 1168.1
## - B 1 115.79 7711.7 1169.9
## - CHAS 1 127.38 7723.3 1170.5
## - ZN 1 248.43 7844.3 1176.4
## - TAX 1 250.17 7846.0 1176.5
## - CRIM 1 290.16 7886.0 1178.5
## - NOX 1 390.00 7985.9 1183.2
## - RAD 1 402.64 7998.5 1183.8
## - PRATIO 1 971.24 8567.1 1210.0
## - DIS 1 1065.15 8661.0 1214.2
## - RM 1 1189.61 8785.5 1219.6
## - LSTAT 1 2153.07 9748.9 1259.2
##
## Step: AIC=1164.39
## MDEV ~ CRIM + ZN + CHAS + NOX + RM + DIS + RAD + TAX + PRATIO +
## B + LSTAT
##
## Df Sum of Sq RSS AIC
## <none> 7600.4 1164.4
## + INDUS 1 4.53 7595.9 1166.2
## + AGE 1 1.22 7599.2 1166.3
## - B 1 114.05 7714.5 1168.1
## - CHAS 1 132.23 7732.6 1169.0
## - ZN 1 244.48 7844.9 1174.5
## - TAX 1 272.90 7873.3 1175.8
## - CRIM 1 293.20 7893.6 1176.8
## - NOX 1 398.54 7998.9 1181.9
## - RAD 1 410.88 8011.3 1182.5
## - PRATIO 1 968.88 8569.3 1208.1
## - DIS 1 1148.81 8749.2 1216.0
## - RM 1 1185.73 8786.1 1217.6
## - LSTAT 1 2151.58 9752.0 1257.4
#KNN Algorithm
library(class)
# Use the predictor columns only. MDEV is the label passed as `cl`, so it
# must not also sit among the features: with it included, the neighbour
# search is driven in part by the very value being predicted.
# knn() treats each distinct MDEV value as a class label.
# NOTE: recorded output that follows was produced with MDEV still among the
# features and will differ once this is re-run.
knnPredictors <- setdiff(names(housingTraining), "MDEV")
knnModel <- knn(
  train = housingTraining[, knnPredictors],
  test = housingTesting[, knnPredictors],
  cl = housingTraining$MDEV
)
summary(knnModel)
## 20.8 14.9 21 18.6 18.7 19.3 11.5 13.4 13.8
## 5 4 4 3 3 3 2 2 2
## 14.1 18 18.9 19.4 20 20.4 20.6 20.9 21.4
## 2 2 2 2 2 2 2 2 2
## 21.5 22.8 22.9 23.1 24.6 24.8 25.3 27.5 28.4
## 2 2 2 2 2 2 2 2 2
## 29 33.2 50 6.3 7 10.2 11.7 12.7 13.1
## 2 2 2 1 1 1 1 1 1
## 13.2 13.3 15.2 15.4 15.6 16.1 16.2 16.3 16.6
## 1 1 1 1 1 1 1 1 1
## 16.7 17 17.1 17.7 18.2 18.3 18.4 19 19.1
## 1 1 1 1 1 1 1 1 1
## 19.2 19.9 20.3 20.5 21.2 21.7 22 22.4 23.8
## 1 1 1 1 1 1 1 1 1
## 23.9 24.2 24.3 24.4 25 26.6 28.5 29.6 29.8
## 1 1 1 1 1 1 1 1 1
## 30.1 32.2 32.4 32.9 33.1 33.8 35.1 35.2 36.2
## 1 1 1 1 1 1 1 1 1
## 37.2 37.9 46 48.8 5 5.6 7.2 7.4 7.5
## 1 1 1 1 0 0 0 0 0
## 8.3 8.4 8.5 8.7 8.8 9.5 9.6 9.7 10.4
## 0 0 0 0 0 0 0 0 0
## (Other)
## 0
# Plot the predicted labels (the summary above shows per-label counts)
plot(x = knnModel)

# Naive Bayes on the housing training rows. The table printed for TAX has
# one row per distinct MDEV value, i.e. the numeric response is handled as
# a set of class labels.
library(e1071)
nb <- naiveBayes(
  MDEV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
    DIS + RAD + TAX + PRATIO + B + LSTAT,
  data = housingTraining
)
# Per-class statistics for the TAX predictor
nb[["tables"]][["TAX"]]
## TAX
## Y [,1] [,2]
## 5 666.0000 0.0000000
## 5.6 666.0000 NA
## 6.3 666.0000 NA
## 7 688.5000 31.8198052
## 7.2 666.0000 0.0000000
## 7.4 666.0000 NA
## 7.5 666.0000 NA
## 8.3 666.0000 0.0000000
## 8.4 666.0000 NA
## 8.5 666.0000 NA
## 8.7 666.0000 NA
## 8.8 666.0000 NA
## 9.5 666.0000 NA
## 9.6 666.0000 NA
## 9.7 666.0000 NA
## 10.2 666.0000 0.0000000
## 10.4 666.0000 0.0000000
## 10.5 666.0000 0.0000000
## 10.9 666.0000 0.0000000
## 11.3 666.0000 NA
## 11.5 666.0000 NA
## 11.7 666.0000 0.0000000
## 11.8 666.0000 NA
## 11.9 273.0000 NA
## 12 666.0000 NA
## 12.1 666.0000 NA
## 12.3 666.0000 NA
## 12.6 666.0000 NA
## 12.7 546.3333 207.2687466
## 12.8 666.0000 NA
## 13.1 546.3333 207.2687466
## 13.2 307.0000 NA
## 13.3 589.6667 132.2132116
## 13.4 534.5000 185.9690835
## 13.5 666.0000 NA
## 13.6 509.0000 285.6711396
## 13.8 600.2500 131.5000000
## 13.9 486.5000 253.8513344
## 14 437.0000 NA
## 14.1 666.0000 0.0000000
## 14.3 666.0000 NA
## 14.4 437.0000 NA
## 14.5 426.6667 207.2687466
## 14.6 534.5000 185.9690835
## 14.8 307.0000 NA
## 14.9 666.0000 0.0000000
## 15 666.0000 NA
## 15.2 561.3333 221.4053598
## 15.3 403.0000 NA
## 15.4 534.5000 185.9690835
## 15.6 382.3333 67.4190873
## 15.7 188.0000 NA
## 16 284.0000 NA
## 16.1 666.0000 NA
## 16.2 437.0000 NA
## 16.3 666.0000 NA
## 16.5 311.0000 NA
## 16.6 270.0000 52.3259018
## 16.7 666.0000 NA
## 16.8 391.0000 NA
## 17 403.0000 NA
## 17.1 370.5000 94.0452019
## 17.2 509.0000 222.0315293
## 17.3 188.0000 NA
## 17.4 403.0000 NA
## 17.5 329.5000 31.8198052
## 17.6 330.0000 NA
## 17.7 666.0000 NA
## 17.8 509.7500 184.8934378
## 17.9 666.0000 NA
## 18 437.0000 NA
## 18.2 320.5000 19.0918831
## 18.3 391.0000 NA
## 18.4 437.0000 NA
## 18.5 381.0000 72.1248917
## 18.6 397.5000 19.0918831
## 18.7 254.0000 42.4264069
## 18.9 300.3333 18.4752086
## 19 445.0000 312.5411973
## 19.1 534.5000 185.9690835
## 19.2 434.5000 3.5355339
## 19.3 351.2000 91.2288332
## 19.4 327.5000 61.6725222
## 19.5 384.0000 0.0000000
## 19.6 447.5000 159.7550208
## 19.7 317.0000 104.6518036
## 19.8 372.6667 63.7599665
## 19.9 666.0000 0.0000000
## 20 370.6000 176.1371625
## 20.1 397.7500 188.2133098
## 20.2 486.5000 253.8513344
## 20.3 330.3333 59.0790431
## 20.4 367.6667 73.8669976
## 20.5 253.6667 71.5984171
## 20.6 358.2500 206.3885898
## 20.7 223.5000 0.7071068
## 20.8 576.6667 154.7298721
## 20.9 330.0000 21.2132034
## 21 296.6667 15.3731367
## 21.1 223.0000 NA
## 21.2 496.3333 148.3587993
## 21.4 438.8000 220.4023593
## 21.5 339.5000 89.8025612
## 21.6 242.0000 NA
## 21.7 390.2000 158.5298079
## 21.8 528.5000 194.4543648
## 21.9 549.0000 202.6499445
## 22 266.2000 45.9913035
## 22.2 264.5000 35.1804870
## 22.3 293.0000 NA
## 22.4 277.0000 NA
## 22.5 277.0000 NA
## 22.6 273.5000 4.9497475
## 22.7 534.5000 185.9690835
## 22.8 333.3333 87.7572409
## 22.9 299.2500 29.4660822
## 23 362.0000 106.0660172
## 23.1 371.7143 134.8378215
## 23.2 541.6667 215.3516504
## 23.3 403.0000 NA
## 23.4 274.0000 43.8406204
## 23.6 283.0000 18.3847763
## 23.7 292.3333 13.2790562
## 23.8 349.2500 62.4519816
## 23.9 296.6667 64.8254065
## 24 301.5000 7.7781746
## 24.1 385.6667 33.2615895
## 24.2 305.0000 NA
## 24.3 346.6667 50.1231816
## 24.4 276.6667 53.5007788
## 24.5 391.0000 NA
## 24.6 291.5000 6.3639610
## 24.7 296.0000 24.0416306
## 24.8 315.5000 35.1804870
## 25 358.8333 160.2316032
## 25.2 223.0000 NA
## 25.3 233.0000 NA
## 26.4 193.0000 NA
## 26.5 384.0000 NA
## 26.6 291.0000 62.2253967
## 26.7 307.0000 NA
## 27 403.0000 NA
## 27.1 311.0000 NA
## 27.5 505.7500 187.6919018
## 28 281.0000 NA
## 28.1 277.0000 NA
## 28.4 249.0000 38.1837662
## 28.5 245.0000 NA
## 28.6 289.0000 NA
## 28.7 249.0000 38.1837662
## 29 332.5000 36.0624458
## 29.1 265.0000 NA
## 29.4 296.0000 NA
## 29.6 261.5000 96.8736290
## 29.8 532.0000 189.5046174
## 29.9 296.0000 NA
## 30.1 252.6667 60.7974780
## 30.3 329.0000 NA
## 30.5 398.0000 NA
## 30.7 264.0000 NA
## 30.8 252.0000 NA
## 31.1 265.0000 NA
## 31.5 307.0000 NA
## 31.6 281.5000 36.0624458
## 31.7 307.0000 NA
## 32 326.0000 101.8233765
## 32.2 285.0000 NA
## 32.4 254.0000 NA
## 32.5 193.0000 NA
## 32.9 402.0000 NA
## 33.1 291.5000 53.0330086
## 33.2 265.0000 15.5563492
## 33.3 329.0000 NA
## 33.8 264.0000 NA
## 34.6 329.0000 NA
## 34.9 325.0000 103.2375901
## 35.1 216.0000 NA
## 35.2 223.0000 NA
## 35.4 226.0000 NA
## 36 264.0000 NA
## 36.1 222.0000 NA
## 36.2 207.5000 20.5060967
## 36.4 398.0000 NA
## 36.5 264.0000 NA
## 37 398.0000 NA
## 37.2 193.0000 NA
## 37.3 245.0000 NA
## 37.6 307.0000 NA
## 37.9 193.0000 NA
## 41.3 403.0000 NA
## 41.7 307.0000 NA
## 42.3 348.0000 NA
## 43.1 264.0000 NA
## 43.8 276.0000 NA
## 44 244.0000 NA
## 44.8 307.0000 NA
## 46 216.0000 NA
## 46.7 307.0000 NA
## 48.3 307.0000 NA
## 48.5 224.0000 NA
## 48.8 264.0000 NA
## 50 419.1000 185.6223466
# Plot the `apriori` component of the fitted naive Bayes object
plot(nb$apriori)

#SVM Algorithm
# read.csv() assumes a header row by default. Column names are assigned by
# hand just below, so the file is treated as header-less here; without
# header = FALSE the first record was consumed as column names and lost
# (the recorded str() output shows 767 observations).
# NOTE: recorded output that follows predates this fix; counts and
# statistics shift slightly once the missing record is included.
pima <- read.csv("pima-indians-diabetes.data", header = FALSE)
colnames(pima) <- c(
  "pregnancies", "glucose", "bp", "triceps", "insulin",
  "bmi", "pedigree", "age", "class"
)
summary(pima)
## pregnancies glucose bp triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.0 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.0 Median :23.00
## Mean : 3.842 Mean :120.9 Mean : 69.1 Mean :20.52
## 3rd Qu.: 6.000 3rd Qu.:140.0 3rd Qu.: 80.0 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.0 Max. :99.00
## insulin bmi pedigree age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2435 1st Qu.:24.00
## Median : 32.0 Median :32.00 Median :0.3710 Median :29.00
## Mean : 79.9 Mean :31.99 Mean :0.4717 Mean :33.22
## 3rd Qu.:127.5 3rd Qu.:36.60 3rd Qu.:0.6250 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## class
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3481
## 3rd Qu.:1.0000
## Max. :1.0000
# Column types of the pima data as loaded
str(object = pima)
## 'data.frame': 767 obs. of 9 variables:
## $ pregnancies: int 1 8 1 0 5 3 10 2 8 4 ...
## $ glucose : int 85 183 89 137 116 78 115 197 125 110 ...
## $ bp : int 66 64 66 40 74 50 0 70 96 92 ...
## $ triceps : int 29 0 23 35 0 32 0 45 0 0 ...
## $ insulin : int 0 0 94 168 0 88 0 543 0 0 ...
## $ bmi : num 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 37.6 ...
## $ pedigree : num 0.351 0.672 0.167 2.288 0.201 ...
## $ age : int 31 32 21 33 30 26 29 53 54 30 ...
## $ class : int 0 1 0 1 0 1 0 1 1 0 ...
# The 0/1 outcome becomes a factor so it is modelled as a class label
pima[["class"]] <- as.factor(pima[["class"]])
# Seeded 75/25 split on the class label
set.seed(3277)
library(caret)
pimaIndices <- createDataPartition(
  pima$class,
  p = 0.75,
  list = FALSE
)
pimaTraining <- pima[pimaIndices, ]
pimaTesting <- pima[-pimaIndices, ]
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
# 20 resampling iterations for caret's tuning loop
bootControl <- trainControl(number = 20)
# Radial-kernel SVM tuned over 5 cost values; column 9 is the class label.
# The earlier call passed scaled = FALSE through to the SVM fit, and the
# recorded result was degenerate: Kappa of 0 for every C, all 576 training
# rows kept as support vectors, and every test case predicted as class 0.
# The predictors are on very different scales (see summary(pima)), so the
# fit is left to scale them, which is ksvm's default.
# NOTE(review): caret appears to choose sigma assuming scaled inputs --
# confirm against caret's svmRadial model definition.
svmFit <- train(
  pimaTraining[, -9],
  pimaTraining[, 9],
  method = "svmRadial",
  tuneLength = 5,
  trControl = bootControl
)
svmFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 576 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (20 reps)
## Summary of sample sizes: 576, 576, 576, 576, 576, 576, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.6493652 0
## 0.50 0.6493652 0
## 1.00 0.6493652 0
## 2.00 0.6493652 0
## 4.00 0.6493652 0
##
## Tuning parameter 'sigma' was held constant at a value of 0.1106031
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.1106031 and C = 0.25.
# Classify the held-out rows with the final fitted SVM (label column 9
# removed from the inputs)
predicted <- predict(svmFit$finalModel, newdata = pimaTesting[, -9])
# True class against predicted class
plot(pimaTesting$class, predicted)

# Confusion table: rows are predictions, columns are true classes
table(pred = predicted, true = pimaTesting$class)
## true
## pred 0 1
## 0 125 66
## 1 0 0
# Details of the final model chosen by train()
svmFit[["finalModel"]]
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 0.25
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.110603110858516
##
## Number of Support Vectors : 576
##
## Objective Function Value : -90.823
## Training error : 0.348958
# k-means clustering on the iris measurements
data(iris)
# 75/25 split on Species
irisIndices <- createDataPartition(
  iris$Species,
  p = 0.75,
  list = FALSE
)
irisTraining <- iris[irisIndices, ]
irisTesting <- iris[-irisIndices, ]
# NOTE(review): bootControl is not used by the k-means steps visible here
bootControl <- trainControl(number = 20)
# Three clusters on the four numeric columns (Species, column 5, excluded)
km <- kmeans(x = irisTraining[, 1:4], centers = 3)
km
## K-means clustering with 3 clusters of sizes 48, 38, 28
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.906250 2.741667 4.402083 1.4354167
## 2 5.010526 3.426316 1.468421 0.2447368
## 3 6.878571 3.042857 5.810714 2.0035714
##
## Clustering vector:
## 1 2 3 4 6 7 8 10 11 12 13 14 16 17 18 19 21 23
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 25 26 27 28 30 32 33 34 36 37 39 40 42 43 44 45 46 47
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 49 50 51 52 53 54 55 56 57 58 59 61 63 64 65 66 67 68
## 2 2 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1
## 70 72 73 74 75 76 77 81 82 83 84 85 87 88 89 90 91 95
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 96 97 99 100 102 104 105 106 107 108 109 111 112 113 114 115 116 117
## 1 1 1 1 1 3 3 3 1 3 3 3 3 3 1 1 3 3
## 118 119 122 123 124 125 126 127 129 130 131 132 133 135 136 137 138 139
## 3 3 1 3 1 3 3 1 3 3 3 3 3 3 3 3 3 1
## 140 143 144 147 149 150
## 3 1 3 1 3 1
##
## Within cluster sum of squares by cluster:
## [1] 31.72437 12.36553 19.69214
## (between_SS / total_SS = 87.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
library(clue)
# Assign each held-out flower to a k-means cluster
cl_predict(km, newdata = irisTesting[, -5])
## Class ids:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 3 1 1 1 1 1 1 1 3 3 3 1 3 1 1 3 3 3 3
## [36] 3
# True species of the same rows, for comparison with the cluster ids
irisTesting[, 5]
## [1] setosa setosa setosa setosa setosa setosa
## [7] setosa setosa setosa setosa setosa setosa
## [13] versicolor versicolor versicolor versicolor versicolor versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] virginica virginica virginica virginica virginica virginica
## [31] virginica virginica virginica virginica virginica virginica
## Levels: setosa versicolor virginica
# Decision tree: regression tree (method = "anova") for MDEV on the
# training rows
library(rpart)
housingFit <- rpart(
  MDEV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
    DIS + RAD + TAX + PRATIO + B + LSTAT,
  method = "anova",
  data = housingTraining
)
# Draw the tree, then label every node
plot(housingFit)
text(housingFit, use.n = TRUE, all = TRUE, cex = .8)

# Sum of squared prediction errors of the tree on the test rows
treePredict <- predict(housingFit, newdata = housingTesting)
diff <- treePredict - housingTesting[["MDEV"]]
# Same helper as defined earlier in the script, restated here
sumofsquares <- function(x) {
  return(sum(x^2))
}
sumofsquares(x = diff)
## [1] 3926.297
# AdaBoost on the pima data: training rows fit the model (column 9 is the
# class label); the held-out rows are passed as test.x / test.y
library(ada)
adaModel <- ada(
  x = pimaTraining[, -9],
  y = pimaTraining$class,
  test.x = pimaTesting[, -9],
  test.y = pimaTesting$class
)
adaModel
## Call:
## ada(pimaTraining[, -9], y = pimaTraining$class, test.x = pimaTesting[,
## -9], test.y = pimaTesting$class)
##
## Loss: exponential Method: discrete Iteration: 50
##
## Final Confusion Matrix for Data:
## Final Prediction
## True value 0 1
## 0 351 24
## 1 32 169
##
## Train Error: 0.097
##
## Out-Of-Bag Error: 0.134 iteration= 48
##
## Additional Estimates of number of iterations:
##
## train.err1 train.kap1 test.err2 test.kap2
## 50 50 3 3
# Training accuracy = correctly classified / all cases. It is read from the
# fitted model's confusion matrix instead of hand-typed counts: the earlier
# literals (358, 168, 33, 17) did not match the matrix printed above
# (351, 24, 32, 169), so the reported figure was wrong.
sum(diag(adaModel$confusion)) / sum(adaModel$confusion)
## [1] 0.9027778
# Neural network for MDEV on the housing training rows
library(neuralnet)
# NOTE(review): this first fit (10 hidden units, threshold 0.01) is
# overwritten by the assignment that follows, so only the second network
# is plotted and scored.
nnet <- neuralnet(
  MDEV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
    DIS + RAD + TAX + PRATIO + B + LSTAT,
  housingTraining,
  hidden = 10,
  threshold = 0.01
)
# Second fit with neuralnet()'s default settings
nnet <- neuralnet(
  MDEV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
    DIS + RAD + TAX + PRATIO + B + LSTAT,
  housingTraining
)
plot(nnet, rep = "best")

# Score the network on the test predictors (column 14, MDEV, removed) and
# report the sum of squared errors
results <- compute(nnet, housingTesting[, -14])
diff <- results$net.result - housingTesting[["MDEV"]]
sumofsquares(x = diff)
## [1] 11016.02175
#random forests Algorithm
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest for MDEV on the training rows, scored on the test rows by
# sum of squared errors
forestFit <- randomForest(
  MDEV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
    DIS + RAD + TAX + PRATIO + B + LSTAT,
  data = housingTraining
)
forestPredict <- predict(forestFit, newdata = housingTesting)
diff <- forestPredict - housingTesting[["MDEV"]]
sumofsquares(x = diff)
## [1] 2322.754856
#Decision Tree
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
##
## importance
# Load the weather observations and print per-column summaries
weather <- read.csv(file = "weather.csv")
summary(object = weather)
## Date Location MinTemp MaxTemp
## 2007-11-01: 1 Canberra:366 Min. :-5.300000 Min. : 7.60000
## 2007-11-02: 1 1st Qu.: 2.300000 1st Qu.:15.02500
## 2007-11-03: 1 Median : 7.450000 Median :19.65000
## 2007-11-04: 1 Mean : 7.265574 Mean :20.55027
## 2007-11-05: 1 3rd Qu.:12.500000 3rd Qu.:25.50000
## 2007-11-06: 1 Max. :20.900000 Max. :35.80000
## (Other) :360
## Rainfall Evaporation Sunshine WindGustDir
## Min. : 0.000000 Min. : 0.200000 Min. : 0.000000 NW : 73
## 1st Qu.: 0.000000 1st Qu.: 2.200000 1st Qu.: 5.950000 NNW : 44
## Median : 0.000000 Median : 4.200000 Median : 8.600000 E : 37
## Mean : 1.428415 Mean : 4.521858 Mean : 7.909366 WNW : 35
## 3rd Qu.: 0.200000 3rd Qu.: 6.400000 3rd Qu.:10.500000 ENE : 30
## Max. :39.800000 Max. :13.800000 Max. :13.600000 (Other):144
## NA's :3 NA's : 3
## WindGustSpeed WindDir9am WindDir3pm WindSpeed9am
## Min. :13.00000 SE : 47 NW : 61 Min. : 0.000000
## 1st Qu.:31.00000 SSE : 40 WNW : 61 1st Qu.: 6.000000
## Median :39.00000 NNW : 36 NNW : 47 Median : 7.000000
## Mean :39.84066 N : 31 N : 30 Mean : 9.651811
## 3rd Qu.:46.00000 NW : 30 ESE : 27 3rd Qu.:13.000000
## Max. :98.00000 (Other):151 (Other):139 Max. :41.000000
## NA's :2 NA's : 31 NA's : 1 NA's :7
## WindSpeed3pm Humidity9am Humidity3pm
## Min. : 0.00000 Min. :36.00000 Min. :13.00000
## 1st Qu.:11.00000 1st Qu.:64.00000 1st Qu.:32.25000
## Median :17.00000 Median :72.00000 Median :43.00000
## Mean :17.98634 Mean :72.03552 Mean :44.51913
## 3rd Qu.:24.00000 3rd Qu.:81.00000 3rd Qu.:55.00000
## Max. :52.00000 Max. :99.00000 Max. :96.00000
##
## Pressure9am Pressure3pm Cloud9am Cloud3pm
## Min. : 996.500 Min. : 996.800 Min. :0.00000 Min. :0.00000
## 1st Qu.:1015.350 1st Qu.:1012.800 1st Qu.:1.00000 1st Qu.:1.00000
## Median :1020.150 Median :1017.400 Median :3.50000 Median :4.00000
## Mean :1019.709 Mean :1016.810 Mean :3.89071 Mean :4.02459
## 3rd Qu.:1024.475 3rd Qu.:1021.475 3rd Qu.:7.00000 3rd Qu.:7.00000
## Max. :1035.700 Max. :1033.200 Max. :8.00000 Max. :8.00000
##
## Temp9am Temp3pm RainToday RISK_MM
## Min. : 0.10000 Min. : 5.10000 No :300 Min. : 0.000000
## 1st Qu.: 7.62500 1st Qu.:14.15000 Yes: 66 1st Qu.: 0.000000
## Median :12.55000 Median :18.55000 Median : 0.000000
## Mean :12.35847 Mean :19.23087 Mean : 1.428415
## 3rd Qu.:17.00000 3rd Qu.:24.00000 3rd Qu.: 0.200000
## Max. :24.70000 Max. :34.50000 Max. :39.800000
##
## RainTomorrow
## No :300
## Yes: 66
##
##
##
##
##
# Drop RISK_MM (as before) plus Date and Location before fitting.
# Date has one value per row, so the tree could split on it and separate
# the classes perfectly: the recorded summary shows a single split on Date
# with zero relative error. That memorises the rows rather than learning a
# rule from the weather measurements.
# Location has a single value (Canberra) in this data and adds nothing.
# NOTE: recorded output that follows predates this change and will differ
# when re-run.
weather2 <- subset(weather, select = -c(RISK_MM, Date, Location))
library(rpart)
# Classification tree for RainTomorrow on all remaining columns
model <- rpart(formula = RainTomorrow ~ ., data = weather2, method = "class")
summary(model)
## Call:
## rpart(formula = RainTomorrow ~ ., data = weather2, method = "class")
## n= 366
##
## CP nsplit rel error xerror xstd
## 1 1.00 0 1 1.0000000000 0.1114417997
## 2 0.01 1 0 0.8333333333 0.1035802753
##
## Variable importance
## Date Humidity3pm Cloud3pm WindGustSpeed MinTemp
## 64 13 8 6 5
## Sunshine
## 5
##
## Node number 1: 366 observations, complexity param=1
## predicted class=No expected loss=0.1803278689 P(node) =1
## class counts: 300 66
## probabilities: 0.820 0.180
## left son=2 (300 obs) right son=3 (66 obs)
## Primary splits:
## Date splits as RRRRLLLLRLLLLLLLRLLLLRLLLLLLLRRLRLLLRLLLLLLLRRLLRRRRLRLLRLLLLLLLLLLLLLLLRLLLRLRRLLLLLLLLLLRRLLRRLLRLLLRRLLLLLLLRRLLLLLLRLLLLLLLRLLLLLLLLLLLLLLLLRRLLLLLLLRLLLLLLLLLRRLLLLLLLLLLLLLRLLLLLLLLLLLLLLLLLRLLLLLLLLLRRLLLLLLLRLLLLLRRLLLLLLLLRLLLLLLLLLLRLLLLLLRLLRLLLLLLLRLRLLLLLLRLLLLRLLLRLRLLLLLLLLLLLLLLLLLLLLLLRRLLLLLLLLLLLLRRRLLLLLLRLLLLLLLLLLRRLRLLLLLLRLLLLLLLLLLLLLLLLLL, improve=108.19672130, (0 missing)
## Humidity3pm < 71.5 to the left, improve= 18.31012675, (0 missing)
## Pressure3pm < 1011.9 to the right, improve= 17.35279669, (0 missing)
## Cloud3pm < 6.5 to the left, improve= 16.14203133, (0 missing)
## Sunshine < 6.45 to the right, improve= 15.36363823, (3 missing)
## Surrogate splits:
## Humidity3pm < 71.5 to the left, agree=0.855, adj=0.197, (0 split)
## Cloud3pm < 7.5 to the left, agree=0.842, adj=0.121, (0 split)
## WindGustSpeed < 64 to the left, agree=0.836, adj=0.091, (0 split)
## MinTemp < 17.55 to the left, agree=0.833, adj=0.076, (0 split)
## Sunshine < 0.25 to the right, agree=0.833, adj=0.076, (0 split)
##
## Node number 2: 300 observations
## predicted class=No expected loss=0 P(node) =0.8196721311
## class counts: 300 0
## probabilities: 1.000 0.000
##
## Node number 3: 66 observations
## predicted class=Yes expected loss=0 P(node) =0.1803278689
## class counts: 0 66
## probabilities: 0.000 1.000
library(rpart.plot)
# Draw the fitted classification tree with a title
fancyRpartPlot(model, main = "Rain Tomorrow")

#regression
# Load the forest fires data set and print per-column summaries.
# stringsAsFactors = TRUE is set explicitly: the summary recorded below lists
# month and day as factor level counts, which only happens when text columns
# are read as factors. Since R 4.0.0 the default is FALSE, so without this
# argument the summary would differ between R versions.
forestfires <- read.csv("forestfires.csv", stringsAsFactors = TRUE)
summary(forestfires)
## X Y month day
## Min. :1.000000 Min. :2.000000 aug :184 fri:85
## 1st Qu.:3.000000 1st Qu.:4.000000 sep :172 mon:74
## Median :4.000000 Median :4.000000 mar : 54 sat:84
## Mean :4.669246 Mean :4.299807 jul : 32 sun:95
## 3rd Qu.:7.000000 3rd Qu.:5.000000 feb : 20 thu:61
## Max. :9.000000 Max. :9.000000 jun : 17 tue:64
## (Other): 38 wed:54
## FFMC DMC DC
## Min. :18.70000 Min. : 1.1000 Min. : 7.90
## 1st Qu.:90.20000 1st Qu.: 68.6000 1st Qu.:437.70
## Median :91.60000 Median :108.3000 Median :664.20
## Mean :90.64468 Mean :110.8723 Mean :547.94
## 3rd Qu.:92.90000 3rd Qu.:142.4000 3rd Qu.:713.90
## Max. :96.20000 Max. :291.3000 Max. :860.60
##
## ISI temp RH
## Min. : 0.000000 Min. : 2.20000 Min. : 15.0000
## 1st Qu.: 6.500000 1st Qu.:15.50000 1st Qu.: 33.0000
## Median : 8.400000 Median :19.30000 Median : 42.0000
## Mean : 9.021663 Mean :18.88917 Mean : 44.2882
## 3rd Qu.:10.800000 3rd Qu.:22.80000 3rd Qu.: 53.0000
## Max. :56.100000 Max. :33.30000 Max. :100.0000
##
## wind rain area
## Min. :0.400000 Min. :0.00000000 Min. : 0.00000
## 1st Qu.:2.700000 1st Qu.:0.00000000 1st Qu.: 0.00000
## Median :4.000000 Median :0.00000000 Median : 0.52000
## Mean :4.017602 Mean :0.02166344 Mean : 12.84729
## 3rd Qu.:4.900000 3rd Qu.:0.00000000 3rd Qu.: 6.57000
## Max. :9.400000 Max. :6.40000000 Max. :1090.84000
##
# Linear regression of burned area on month, temperature, wind and rain;
# the coefficient table and fit statistics are printed by summary().
model <- lm(
  area ~ month + temp + wind + rain,
  data = forestfires
)
summary(model)
##
## Call:
## lm(formula = area ~ month + temp + wind + rain, data = forestfires)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.20126 -14.92936 -9.09864 -1.65641 1063.58600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.389663 24.531469 -0.70887 0.478733
## monthaug -10.342265 22.761152 -0.45438 0.649750
## monthdec 11.533891 30.895985 0.37331 0.709073
## monthfeb 2.606651 25.795917 0.10105 0.919552
## monthjan 5.988231 50.493480 0.11859 0.905644
## monthjul -8.821520 25.068066 -0.35190 0.725059
## monthjun -15.469199 26.973935 -0.57349 0.566572
## monthmar -6.630304 23.057135 -0.28756 0.773802
## monthmay 6.602782 50.052985 0.13192 0.895104
## monthnov -8.243943 67.451398 -0.12222 0.902773
## monthoct -8.267770 27.236986 -0.30355 0.761597
## monthsep -1.070170 22.487923 -0.04759 0.962063
## temp 1.569244 0.673020 2.33165 0.020114 *
## wind 1.581449 1.710697 0.92445 0.355698
## rain -3.179054 9.595489 -0.33131 0.740551
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63.98916 on 502 degrees of freedom
## Multiple R-squared: 0.01691589, Adjusted R-squared: -0.0105008
## F-statistic: 0.6169923 on 14 and 502 DF, p-value: 0.8518111
# Refit the regression without the temp term (month, wind and rain only)
# and print the coefficient table and fit statistics.
model <- lm(
  area ~ month + wind + rain,
  data = forestfires
)
summary(model)
##
## Call:
## lm(formula = area ~ month + wind + rain, data = forestfires)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.16857 -14.38907 -10.45919 -3.87373 1072.43486
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.0125953 22.8495756 0.17561 0.86067
## monthaug 4.3131721 21.9723517 0.19630 0.84445
## monthdec 1.3259312 30.7188113 0.04316 0.96559
## monthfeb -1.6630582 25.8440653 -0.06435 0.94872
## monthjan -6.1033878 50.4474881 -0.12098 0.90375
## monthjul 6.4647558 24.3020651 0.26602 0.79034
## monthjun -2.4944398 26.5098551 -0.09409 0.92507
## monthmar -4.8431458 23.1457972 -0.20925 0.83434
## monthmay 10.5753914 50.2441062 0.21048 0.83338
## monthnov -8.7168784 67.7479071 -0.12867 0.89767
## monthoct -0.9916663 27.1766956 -0.03649 0.97091
## monthsep 10.2109631 22.0579191 0.46292 0.64362
## wind 1.0453962 1.7026381 0.61399 0.53950
## rain -1.8504292 9.6207046 -0.19234 0.84755
##
## Residual standard error: 64.27074 on 503 degrees of freedom
## Multiple R-squared: 0.006269255, Adjusted R-squared: -0.01941365
## F-statistic: 0.2441023 on 13 and 503 DF, p-value: 0.9971217
# Draw the diagnostic plots for the most recently fitted lm model.
# The warnings recorded below report that observation 517 has leverage one
# and is therefore left out of the plots.
plot(model)
## Warning: not plotting observations with leverage one:
## 517


## Warning: not plotting observations with leverage one:
## 517


#ANN
# Load the BUPA liver-disorders records and name the columns.
# header = FALSE is required: the column names are assigned by hand on the
# next line, so the file carries no header row. With read.csv's default
# (header = TRUE) the first data record is silently consumed as column names
# and lost.
# NOTE(review): the summary recorded below was produced before this fix
# (its means are fractions over 344 rows), so re-running will print slightly
# different figures.
bupa <- read.csv("bupa.data", header = FALSE)
colnames(bupa) <- c(
  "mcv", "alkphos", "alamine", "aspartate", "glutamyl", "drinks", "selector"
)
summary(bupa)
## mcv alkphos alamine
## Min. : 65.00000 Min. : 23.00000 Min. : 4.00000
## 1st Qu.: 87.00000 1st Qu.: 57.00000 1st Qu.: 19.00000
## Median : 90.00000 Median : 67.00000 Median : 26.00000
## Mean : 90.17442 Mean : 69.80523 Mean : 30.36337
## 3rd Qu.: 93.00000 3rd Qu.: 80.00000 3rd Qu.: 34.00000
## Max. :103.00000 Max. :138.00000 Max. :155.00000
## aspartate glutamyl drinks
## Min. : 5.00000 Min. : 5.00000 Min. : 0.000000
## 1st Qu.:19.00000 1st Qu.: 15.00000 1st Qu.: 0.500000
## Median :23.00000 Median : 24.50000 Median : 3.000000
## Mean :24.63663 Mean : 38.30523 Mean : 3.465116
## 3rd Qu.:27.00000 3rd Qu.: 46.25000 3rd Qu.: 6.000000
## Max. :82.00000 Max. :297.00000 Max. :20.000000
## selector
## Min. :1.000000
## 1st Qu.:1.000000
## Median :2.000000
## Mean :1.581395
## 3rd Qu.:2.000000
## Max. :2.000000
# Train a neural network with one hidden layer of two units to predict the
# selector class from the blood-test measurements and drinks.
#
# With linear.output = FALSE the output unit is squashed by the activation
# function, so it cannot reach a target of 2. The summary above shows
# selector ranging over 1..2; fitting on that coding makes the net saturate
# at 1 (the error of ~100 recorded below equals 0.5 * the number of class-2
# rows). The target is therefore shifted to 0/1 on a copy, leaving bupa
# itself untouched.
# NOTE(review): assumes selector only takes the values 1 and 2 -- confirm.
# The result matrix recorded below predates this change.
bupa_nn <- transform(bupa, selector = selector - 1)
nn <- neuralnet(
  selector ~ mcv + alkphos + alamine + aspartate + glutamyl + drinks,
  data = bupa_nn,
  linear.output = FALSE,
  hidden = 2
)
nn$result.matrix
## 1
## error 100.00134410555
## reached.threshold 0.00245301475
## steps 47.00000000000
## Intercept.to.1layhid1 0.41602844406
## mcv.to.1layhid1 -0.10920845301
## alkphos.to.1layhid1 0.53367967640
## alamine.to.1layhid1 2.65321469994
## aspartate.to.1layhid1 0.49399792808
## glutamyl.to.1layhid1 0.50537073025
## drinks.to.1layhid1 0.03765883152
## Intercept.to.1layhid2 0.25098970910
## mcv.to.1layhid2 0.47460785268
## alkphos.to.1layhid2 -0.54659321629
## alamine.to.1layhid2 0.21558022509
## aspartate.to.1layhid2 -0.13052791775
## glutamyl.to.1layhid2 0.69013092149
## drinks.to.1layhid2 -0.95369381418
## Intercept.to.selector 4.19048690517
## 1layhid.1.to.selector 5.29556987846
## 1layhid.2.to.selector 2.81924720338
# Plot the fitted neuralnet object.
plot(nn)
#SVM
# Attach kernlab, load its spam data set and print per-column summaries.
library(kernlab)
data(spam)
summary(spam)
## make address all
## Min. :0.0000000 Min. : 0.0000000 Min. :0.0000000
## 1st Qu.:0.0000000 1st Qu.: 0.0000000 1st Qu.:0.0000000
## Median :0.0000000 Median : 0.0000000 Median :0.0000000
## Mean :0.1045534 Mean : 0.2130146 Mean :0.2806564
## 3rd Qu.:0.0000000 3rd Qu.: 0.0000000 3rd Qu.:0.4200000
## Max. :4.5400000 Max. :14.2800000 Max. :5.1000000
## num3d our over
## Min. : 0.00000000 Min. : 0.0000000 Min. :0.00000000
## 1st Qu.: 0.00000000 1st Qu.: 0.0000000 1st Qu.:0.00000000
## Median : 0.00000000 Median : 0.0000000 Median :0.00000000
## Mean : 0.06542491 Mean : 0.3122234 Mean :0.09590089
## 3rd Qu.: 0.00000000 3rd Qu.: 0.3800000 3rd Qu.:0.00000000
## Max. :42.81000000 Max. :10.0000000 Max. :5.88000000
## remove internet order
## Min. :0.0000000 Min. : 0.0000000 Min. :0.00000000
## 1st Qu.:0.0000000 1st Qu.: 0.0000000 1st Qu.:0.00000000
## Median :0.0000000 Median : 0.0000000 Median :0.00000000
## Mean :0.1142078 Mean : 0.1052945 Mean :0.09006738
## 3rd Qu.:0.0000000 3rd Qu.: 0.0000000 3rd Qu.:0.00000000
## Max. :7.2700000 Max. :11.1100000 Max. :5.26000000
## mail receive will
## Min. : 0.0000000 Min. :0.00000000 Min. :0.0000000
## 1st Qu.: 0.0000000 1st Qu.:0.00000000 1st Qu.:0.0000000
## Median : 0.0000000 Median :0.00000000 Median :0.1000000
## Mean : 0.2394132 Mean :0.05982395 Mean :0.5417018
## 3rd Qu.: 0.1600000 3rd Qu.:0.00000000 3rd Qu.:0.8000000
## Max. :18.1800000 Max. :2.61000000 Max. :9.6700000
## people report addresses
## Min. :0.00000000 Min. : 0.00000000 Min. :0.00000000
## 1st Qu.:0.00000000 1st Qu.: 0.00000000 1st Qu.:0.00000000
## Median :0.00000000 Median : 0.00000000 Median :0.00000000
## Mean :0.09392958 Mean : 0.05862639 Mean :0.04920452
## 3rd Qu.:0.00000000 3rd Qu.: 0.00000000 3rd Qu.:0.00000000
## Max. :5.55000000 Max. :10.00000000 Max. :4.41000000
## free business email
## Min. : 0.0000000 Min. :0.0000000 Min. :0.0000000
## 1st Qu.: 0.0000000 1st Qu.:0.0000000 1st Qu.:0.0000000
## Median : 0.0000000 Median :0.0000000 Median :0.0000000
## Mean : 0.2488481 Mean :0.1425864 Mean :0.1847446
## 3rd Qu.: 0.1000000 3rd Qu.:0.0000000 3rd Qu.:0.0000000
## Max. :20.0000000 Max. :7.1400000 Max. :9.0900000
## you credit your
## Min. : 0.0000 Min. : 0.00000000 Min. : 0.0000000
## 1st Qu.: 0.0000 1st Qu.: 0.00000000 1st Qu.: 0.0000000
## Median : 1.3100 Median : 0.00000000 Median : 0.2200000
## Mean : 1.6621 Mean : 0.08557705 Mean : 0.8097609
## 3rd Qu.: 2.6400 3rd Qu.: 0.00000000 3rd Qu.: 1.2700000
## Max. :18.7500 Max. :18.18000000 Max. :11.1100000
## font num000 money
## Min. : 0.0000000 Min. :0.0000000 Min. : 0.00000000
## 1st Qu.: 0.0000000 1st Qu.:0.0000000 1st Qu.: 0.00000000
## Median : 0.0000000 Median :0.0000000 Median : 0.00000000
## Mean : 0.1212019 Mean :0.1016453 Mean : 0.09426864
## 3rd Qu.: 0.0000000 3rd Qu.:0.0000000 3rd Qu.: 0.00000000
## Max. :17.1000000 Max. :5.4500000 Max. :12.50000000
## hp hpl george
## Min. : 0.0000000 Min. : 0.0000000 Min. : 0.0000000
## 1st Qu.: 0.0000000 1st Qu.: 0.0000000 1st Qu.: 0.0000000
## Median : 0.0000000 Median : 0.0000000 Median : 0.0000000
## Mean : 0.5495045 Mean : 0.2653836 Mean : 0.7673049
## 3rd Qu.: 0.0000000 3rd Qu.: 0.0000000 3rd Qu.: 0.0000000
## Max. :20.8300000 Max. :16.6600000 Max. :33.3300000
## num650 lab labs
## Min. :0.0000000 Min. : 0.00000000 Min. :0.0000000
## 1st Qu.:0.0000000 1st Qu.: 0.00000000 1st Qu.:0.0000000
## Median :0.0000000 Median : 0.00000000 Median :0.0000000
## Mean :0.1248446 Mean : 0.09891545 Mean :0.1028516
## 3rd Qu.:0.0000000 3rd Qu.: 0.00000000 3rd Qu.:0.0000000
## Max. :9.0900000 Max. :14.28000000 Max. :5.8800000
## telnet num857 data
## Min. : 0.00000000 Min. :0.00000000 Min. : 0.00000000
## 1st Qu.: 0.00000000 1st Qu.:0.00000000 1st Qu.: 0.00000000
## Median : 0.00000000 Median :0.00000000 Median : 0.00000000
## Mean : 0.06475331 Mean :0.04704847 Mean : 0.09722886
## 3rd Qu.: 0.00000000 3rd Qu.:0.00000000 3rd Qu.: 0.00000000
## Max. :12.50000000 Max. :4.76000000 Max. :18.18000000
## num415 num85 technology
## Min. :0.00000000 Min. : 0.0000000 Min. :0.00000000
## 1st Qu.:0.00000000 1st Qu.: 0.0000000 1st Qu.:0.00000000
## Median :0.00000000 Median : 0.0000000 Median :0.00000000
## Mean :0.04783525 Mean : 0.1054119 Mean :0.09747664
## 3rd Qu.:0.00000000 3rd Qu.: 0.0000000 3rd Qu.:0.00000000
## Max. :4.76000000 Max. :20.0000000 Max. :7.69000000
## num1999 parts pm
## Min. :0.0000000 Min. :0.00000000 Min. : 0.00000000
## 1st Qu.:0.0000000 1st Qu.:0.00000000 1st Qu.: 0.00000000
## Median :0.0000000 Median :0.00000000 Median : 0.00000000
## Mean :0.1369528 Mean :0.01320148 Mean : 0.07862856
## 3rd Qu.:0.0000000 3rd Qu.:0.00000000 3rd Qu.: 0.00000000
## Max. :6.8900000 Max. :8.33000000 Max. :11.11000000
## direct cs meeting
## Min. :0.00000000 Min. :0.00000000 Min. : 0.0000000
## 1st Qu.:0.00000000 1st Qu.:0.00000000 1st Qu.: 0.0000000
## Median :0.00000000 Median :0.00000000 Median : 0.0000000
## Mean :0.06483373 Mean :0.04366659 Mean : 0.1323386
## 3rd Qu.:0.00000000 3rd Qu.:0.00000000 3rd Qu.: 0.0000000
## Max. :4.76000000 Max. :7.14000000 Max. :14.2800000
## original project re
## Min. :0.00000000 Min. : 0.00000000 Min. : 0.0000000
## 1st Qu.:0.00000000 1st Qu.: 0.00000000 1st Qu.: 0.0000000
## Median :0.00000000 Median : 0.00000000 Median : 0.0000000
## Mean :0.04609867 Mean : 0.07919583 Mean : 0.3012236
## 3rd Qu.:0.00000000 3rd Qu.: 0.00000000 3rd Qu.: 0.1100000
## Max. :3.57000000 Max. :20.00000000 Max. :21.4200000
## edu table conference
## Min. : 0.000000 Min. :0.000000000 Min. : 0.00000000
## 1st Qu.: 0.000000 1st Qu.:0.000000000 1st Qu.: 0.00000000
## Median : 0.000000 Median :0.000000000 Median : 0.00000000
## Mean : 0.179824 Mean :0.005444469 Mean : 0.03186916
## 3rd Qu.: 0.000000 3rd Qu.:0.000000000 3rd Qu.: 0.00000000
## Max. :22.050000 Max. :2.170000000 Max. :10.00000000
## charSemicolon charRoundbracket charSquarebracket
## Min. :0.00000000 Min. :0.0000000 Min. :0.00000000
## 1st Qu.:0.00000000 1st Qu.:0.0000000 1st Qu.:0.00000000
## Median :0.00000000 Median :0.0650000 Median :0.00000000
## Mean :0.03857466 Mean :0.1390304 Mean :0.01697588
## 3rd Qu.:0.00000000 3rd Qu.:0.1880000 3rd Qu.:0.00000000
## Max. :4.38500000 Max. :9.7520000 Max. :4.08100000
## charExclamation charDollar charHash
## Min. : 0.0000000 Min. :0.00000000 Min. : 0.00000000
## 1st Qu.: 0.0000000 1st Qu.:0.00000000 1st Qu.: 0.00000000
## Median : 0.0000000 Median :0.00000000 Median : 0.00000000
## Mean : 0.2690708 Mean :0.07581069 Mean : 0.04423821
## 3rd Qu.: 0.3150000 3rd Qu.:0.05200000 3rd Qu.: 0.00000000
## Max. :32.4780000 Max. :6.00300000 Max. :19.82900000
## capitalAve capitalLong capitalTotal
## Min. : 1.000000 Min. : 1.00000 Min. : 1.0000
## 1st Qu.: 1.588000 1st Qu.: 6.00000 1st Qu.: 35.0000
## Median : 2.276000 Median : 15.00000 Median : 95.0000
## Mean : 5.191515 Mean : 52.17279 Mean : 283.2893
## 3rd Qu.: 3.706000 3rd Qu.: 43.00000 3rd Qu.: 266.0000
## Max. :1102.500000 Max. :9989.00000 Max. :15841.0000
## type
## nonspam:2788
## spam :1813
##
##
##
##
# Count how many messages fall into each class of the type column.
table(spam[["type"]])
##
## nonspam spam
## 2788 1813
# Randomly hold out one third of the spam rows as a test set and keep the
# remainder for training. seq_len() is used instead of 1:nrow() so that an
# empty data frame yields an empty index rather than c(1, 0).
index <- seq_len(nrow(spam))
testindex <- sample(index, trunc(length(index) / 3))
testset <- spam[testindex, ]
trainingset <- spam[-testindex, ]
library(e1071)
# Fit a radial-kernel SVM classifier for the message type.
# The machine type is selected with svm()'s `type` argument; the previous
# `method = ...` is not an svm() argument and was silently ignored (the
# classifier was only chosen because the response is a factor).
model <- svm(
  type ~ .,
  data = trainingset,
  type = "C-classification",
  kernel = "radial",
  cost = 10,
  gamma = 0.1
)
summary(model)
##
## Call:
## svm(formula = type ~ ., data = trainingset, type = "C-classification",
## kernel = "radial", cost = 10, gamma = 0.1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
## gamma: 0.1
##
## Number of Support Vectors: 1534
##
## ( 638 896 )
##
##
## Number of Classes: 2
##
## Levels:
## nonspam spam
# Classify the held-out messages and cross-tabulate predictions (rows)
# against the true classes (columns).
pred <- predict(model, testset)
confusion <- table(pred, testset$type)
confusion
##
## pred nonspam spam
## nonspam 882 94
## spam 32 525
# Accuracy = correctly classified (diagonal) / all test messages.
# Computed from the confusion matrix itself; the previously hard-coded
# counts did not match the table above.
sum(diag(confusion)) / sum(confusion)
## [1] 0.9178082192
#Random forests Algorithm
# Fit a random forest classifier for the message type on the full spam data
# and print the model, including its out-of-bag error and confusion matrix.
library(randomForest)
fit <- randomForest(
  type ~ .,
  data = spam
)
print(fit)
##
## Call:
## randomForest(formula = type ~ ., data = spam)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 4.78%
## Confusion matrix:
## nonspam spam class.error
## nonspam 2706 82 0.02941176471
## spam 138 1675 0.07611693326
#Hidden Markov Model
library(HMM)
# Two-state weather HMM (Rainy/Sunny) emitting one of three activities.
# Both probability matrices are laid out with one row per state, so they are
# filled row-wise: each row must sum to 1. The previous column-wise fill
# (and a 3-row emission matrix for 2 states) produced rows summing to 1.1
# and 0.9.
#   transitions: Rainy -> (Rainy .7, Sunny .3); Sunny -> (Rainy .4, Sunny .6)
#   emissions:   Rainy -> (walk .1, shop .4, clean .5)
#                Sunny -> (walk .6, shop .3, clean .1)
hmm <- initHMM(
  c("Rainy", "Sunny"),
  c("walk", "shop", "clean"),
  c(.6, .4),
  matrix(c(.7, .3, .4, .6), nrow = 2, byrow = TRUE),
  matrix(c(.1, .4, .5, .6, .3, .1), nrow = 2, byrow = TRUE)
)
hmm
## $States
## [1] "Rainy" "Sunny"
##
## $Symbols
## [1] "walk" "shop" "clean"
##
## $startProbs
## Rainy Sunny
## 0.6 0.4
##
## $transProbs
## to
## from Rainy Sunny
## Rainy 0.7 0.3
## Sunny 0.4 0.6
##
## $emissionProbs
## symbols
## states walk shop clean
## Rainy 0.1 0.4 0.5
## Sunny 0.6 0.3 0.1
# Log forward probabilities of each state after observing
# walk, shop, clean in that order.
future <- forward(hmm, c("walk", "shop", "clean"))
future
## index
## states 1 2 3
## Rainy -2.813410717 -2.896792326 -3.539081089
## Sunny -1.427116356 -3.024131748 -5.387804533
#Blind signal separation
# Attach FactoMineR, load its decathlon data set and print per-column
# summaries.
library(FactoMineR)
data("decathlon")
summary(decathlon)
## 100m Long.jump Shot.put High.jump
## Min. :10.44000 Min. :6.61 Min. :12.68000 Min. :1.850000
## 1st Qu.:10.85000 1st Qu.:7.03 1st Qu.:13.88000 1st Qu.:1.920000
## Median :10.98000 Median :7.30 Median :14.57000 Median :1.950000
## Mean :10.99805 Mean :7.26 Mean :14.47707 Mean :1.976829
## 3rd Qu.:11.14000 3rd Qu.:7.48 3rd Qu.:14.97000 3rd Qu.:2.040000
## Max. :11.64000 Max. :7.96 Max. :16.36000 Max. :2.150000
## 400m 110m.hurdle Discus
## Min. :46.81000 Min. :13.97000 Min. :37.92000
## 1st Qu.:48.93000 1st Qu.:14.21000 1st Qu.:41.90000
## Median :49.40000 Median :14.48000 Median :44.41000
## Mean :49.61634 Mean :14.60585 Mean :44.32561
## 3rd Qu.:50.30000 3rd Qu.:14.98000 3rd Qu.:46.07000
## Max. :53.20000 Max. :15.67000 Max. :51.65000
## Pole.vault Javeline 1500m
## Min. :4.200000 Min. :50.31000 Min. :262.1000
## 1st Qu.:4.500000 1st Qu.:55.27000 1st Qu.:271.0200
## Median :4.800000 Median :58.36000 Median :278.0500
## Mean :4.762439 Mean :58.31659 Mean :279.0249
## 3rd Qu.:4.920000 3rd Qu.:60.89000 3rd Qu.:285.1000
## Max. :5.400000 Max. :70.52000 Max. :317.0000
## Rank Points Competition
## Min. : 1.00000 Min. :7313.000 Decastar:13
## 1st Qu.: 6.00000 1st Qu.:7802.000 OlympicG:28
## Median :11.00000 Median :8021.000
## Mean :12.12195 Mean :8005.366
## 3rd Qu.:18.00000 3rd Qu.:8122.000
## Max. :28.00000 Max. :8893.000
# Show the first six athletes of the decathlon data.
head(decathlon, n = 6L)
## 100m Long.jump Shot.put High.jump 400m 110m.hurdle Discus
## SEBRLE 11.04 7.58 14.83 2.07 49.81 14.69 43.75
## CLAY 10.76 7.40 14.26 1.86 49.37 14.05 50.72
## KARPOV 11.02 7.30 14.77 2.04 48.37 14.09 48.95
## BERNARD 11.02 7.23 14.25 1.92 48.93 14.99 40.87
## YURKOV 11.34 7.09 15.19 2.10 50.42 15.31 46.26
## WARNERS 11.11 7.60 14.31 1.98 48.68 14.23 41.10
## Pole.vault Javeline 1500m Rank Points Competition
## SEBRLE 5.02 63.19 291.7 1 8217 Decastar
## CLAY 4.92 60.15 301.5 2 8122 Decastar
## KARPOV 4.92 50.31 300.2 3 8099 Decastar
## BERNARD 5.32 62.77 280.1 4 8067 Decastar
## YURKOV 4.72 63.44 276.4 5 8036 Decastar
## WARNERS 4.92 51.77 278.1 6 8030 Decastar
# Principal component analysis on the first ten columns of decathlon,
# scaled to unit variance, keeping five components and drawing the graphs.
# Uses `<-` for assignment and TRUE instead of T (T is an ordinary variable
# that can be reassigned).
res.pca <- PCA(decathlon[, 1:10], scale.unit = TRUE, ncp = 5, graph = TRUE)