HOMEWORK 7: MODELLING THE STRENGTH OF CONCRETE.
- Read in the data.
concrete <- read.csv("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml11/concrete.csv")
- Examine the structure of the data.
str(concrete)
## 'data.frame': 1030 obs. of 9 variables:
## $ cement : num 141 169 250 266 155 ...
## $ slag : num 212 42.2 0 114 183.4 ...
## $ ash : num 0 124.3 95.7 0 0 ...
## $ water : num 204 158 187 228 193 ...
## $ superplastic: num 0 10.8 5.5 0 9.1 0 0 6.4 0 9 ...
## $ coarseagg : num 972 1081 957 932 1047 ...
## $ fineagg : num 748 796 861 670 697 ...
## $ age : int 28 14 28 28 28 90 7 56 28 28 ...
## $ strength : num 29.9 23.5 29.2 45.9 18.3 ...
- Defining a custom min-max normalization function.
normalize <- function(x) {
return((x - min(x)) / (max(x) - min(x)))
}
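- As a quick illustration (not part of the assignment), the function maps the smallest value to 0 and the largest to 1:
normalize(c(10, 20, 30, 40, 50))
## [1] 0.00 0.25 0.50 0.75 1.00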
- Applying the normalization function to the entire data frame.
concrete_norm <- as.data.frame(lapply(concrete, normalize))
- Confirming that the range is now between 0 and 1.
summary(concrete_norm$strength)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2664 0.4001 0.4172 0.5457 1.0000
- Comparing the original minimum and maximum.
summary(concrete$strength)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.33 23.71 34.44 35.82 46.14 82.60
- Splitting the data into training (first 773 rows, about 75%) and test (remaining 257 rows) sets.
concrete_train <- concrete_norm[1:773, ]
concrete_test <- concrete_norm[774:1030, ]
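- This reproduces the textbook's split: the first 75% of rows for training, the rest for testing. A randomized split, sketched below but not used here, would guard against any ordering in the file:
# Illustrative alternative (commented out so it does not overwrite the
# split above); train_idx is a name introduced here for illustration.
# set.seed(123)
# train_idx <- sample(nrow(concrete_norm), round(0.75 * nrow(concrete_norm)))
# concrete_train <- concrete_norm[train_idx, ]
# concrete_test  <- concrete_norm[-train_idx, ]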
- Train the neuralnet model.
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 3.3.3
- Creating a simple ANN with only a single hidden neuron.
- To guarantee reproducible results, we set a seed. This is essential when optimizing a function that involves randomly generated numbers (here, the random initial weights of the network; more generally, e.g., simulation-based estimation). Otherwise, the variation due to drawing different random numbers would produce a different fitted network on every run.
set.seed(12345)
concrete_model <- neuralnet(formula = strength ~ cement + slag +
ash + water + superplastic +
coarseagg + fineagg + age,
data = concrete_train)
- Visualizing the network topology.
plot(concrete_model)
- Here we see the connections between the variables and how they contribute to the strength of the concrete. The model converged with an error of 5.077438 after 4882 steps. Allowing more steps might reduce the error further.
- Getting an alternative visual plot.
library(NeuralNetTools)
## Warning: package 'NeuralNetTools' was built under R version 3.3.3
par(mar = numeric(4), family = 'serif')
plotnet(concrete_model, alpha = 0.6)
- Obtaining model results.
model_results <- compute(concrete_model, concrete_test[1:8])
- Obtaining predicted strength values.
predicted_strength <- model_results$net.result
- Examining the correlation between predicted and actual values. This value is higher than the one stated in the textbook (0.7170368646), indicating a stronger correlation.
cor(predicted_strength, concrete_test$strength)
## [,1]
## [1,] 0.8064655576
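- Correlation measures only the linear association; a complementary check (an illustrative addition, using the objects defined above) is the mean absolute error on the normalized scale:
# MAE is a helper introduced here for illustration.
MAE <- function(actual, predicted) mean(abs(actual - predicted))
MAE(concrete_test$strength, predicted_strength)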
- Inspecting the first few predictions (still on the normalized scale).
head(predicted_strength)
## [,1]
## 774 0.3258991537
## 775 0.4677425372
## 776 0.2370268181
## 777 0.6718811029
## 778 0.4663428766
## 779 0.4685272270
concrete_train_original_strength <- concrete[1:773,"strength"]
strength_min <- min(concrete_train_original_strength)
strength_max <- max(concrete_train_original_strength)
head(concrete_train_original_strength)
## [1] 29.89 23.51 29.22 45.85 18.29 21.86
- Defining a function to reverse the normalization, mapping predictions back to the original strength scale.
unnormalize <- function(x, min, max) {
return( (max - min)*x + min )
}
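- A quick round-trip check (illustrative): un-normalizing a normalized vector with its own minimum and maximum recovers the original values.
x <- c(2.33, 35.82, 82.60)  # arbitrary strength values, for illustration only
unnormalize(normalize(x), min(x), max(x))
## [1]  2.33 35.82 82.60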
strength_pred <- unnormalize(predicted_strength, strength_min, strength_max)
- Inspecting the first few rescaled predictions.
head(strength_pred)
## [,1]
## 774 28.212910787
## 775 39.478112301
## 776 21.154669896
## 777 55.690797192
## 778 39.366951260
## 779 39.540432369
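- As a sanity check (an illustrative addition), we can line the rescaled predictions up against the actual unscaled test strengths; since unnormalize is a linear map, the correlation is unchanged (about 0.806, as above). Note that strictly the reverse transformation should use the same minimum and maximum that normalize used (those of the full data set), so the rescaled values are only approximate if the training rows do not span the full range.
# concrete_test_original_strength is a name introduced here for illustration.
concrete_test_original_strength <- concrete[774:1030, "strength"]
head(data.frame(predicted = strength_pred[, 1],
                actual = concrete_test_original_strength))
cor(strength_pred[, 1], concrete_test_original_strength)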
- Creating a more complex neural network topology with five hidden neurons. This will require more training steps.
- We set a seed to guarantee reproducible results.
set.seed(12345)
- The commonly used link function is the logit; its inverse, the logistic (sigmoid), is the activation we use in this second model. It is the default smooth activation in neuralnet and a common choice for binomial models.
concrete_model2 <- neuralnet(strength ~ cement + slag +
ash + water + superplastic +
coarseagg + fineagg + age,
data = concrete_train, hidden = 5, act.fct = "logistic")
- Plotting the network. With the second model and many more steps taken, the error has dropped from 5.077438 to 1.626684, making it a much better model for predicting the strength.
plot(concrete_model2)
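- The error and step count quoted above can be read directly off the fitted object (a sketch using neuralnet's result matrix):
# The first rows of result.matrix hold the sum of squared errors,
# the partial-derivative threshold reached, and the number of steps.
concrete_model2$result.matrix[c("error", "reached.threshold", "steps"), ]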
- Plotting using ‘plotnet’.
par(mar = numeric(4), family = 'serif')
plotnet(concrete_model2, alpha = 0.6)
- Evaluating the results.
model_results2 <- compute(concrete_model2, concrete_test[1:8])
predicted_strength2 <- model_results2$net.result
- We get a higher correlation in the second model than in the first, so this is a better model for our data.
cor(predicted_strength2, concrete_test$strength)
## [,1]
## [1,] 0.9244533426
- Trying a different activation function: the same topology with five hidden neurons, now using tanh.
set.seed(12345)
concrete_model2 <- neuralnet(strength ~ cement + slag +
ash + water + superplastic +
coarseagg + fineagg + age,
data = concrete_train, hidden = 5, act.fct = "tanh")
- Evaluating the results.
model_results2 <- compute(concrete_model2, concrete_test[1:8])
predicted_strength2 <- model_results2$net.result
- The correlation here (about 0.57) is noticeably weaker than with the logistic activation, so tanh does not improve this model; one possible reason is sketched below.
cor(predicted_strength2, concrete_test$strength)
## [,1]
## [1,] 0.5741729322
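- One plausible explanation (an assumption, not verified here): tanh outputs lie in [-1, 1] while our data were scaled to [0, 1], and tanh networks sometimes train better when the inputs are rescaled to match. A commented sketch:
# Illustrative only; normalize_pm1 is a helper introduced here.
# normalize_pm1 <- function(x) 2 * (x - min(x)) / (max(x) - min(x)) - 1
# concrete_pm1 <- as.data.frame(lapply(concrete, normalize_pm1))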
library(h2o)
## Warning: package 'h2o' was built under R version 3.3.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
h2o.init(nthreads=4, max_mem_size="2G")
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 hours 58 minutes
## H2O cluster version: 3.10.4.6
## H2O cluster version age: 27 days
## H2O cluster name: H2O_started_from_R_annmo_fgl344
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.61 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 3.3.2 (2016-10-31)
h2o.removeAll() ## clean slate - just in case the cluster was already running
h2o.init()
## Connection successful!
- Feeding in the data.
concrete.hex <- h2o.importFile("http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml11/concrete.csv")
- Getting the summary of the data.
summary(concrete.hex)
## Warning in summary.H2OFrame(concrete.hex): Approximated quantiles
## computed! If you are interested in exact quantiles, please pass the
## `exact_quantiles=TRUE` parameter.
## cement slag ash water
## Min. :102.0 Min. : 0.00 Min. : 0.00 Min. :121.8
## 1st Qu.:192.1 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:164.9
## Median :272.6 Median : 21.92 Median : 0.00 Median :184.9
## Mean :281.2 Mean : 73.90 Mean : 54.19 Mean :181.6
## 3rd Qu.:349.9 3rd Qu.:142.68 3rd Qu.:118.26 3rd Qu.:191.9
## Max. :540.0 Max. :359.40 Max. :200.10 Max. :247.0
## superplastic coarseagg fineagg age
## Min. : 0.000 Min. : 801.0 Min. :594.0 Min. : 1.00
## 1st Qu.: 0.000 1st Qu.: 931.7 1st Qu.:730.8 1st Qu.: 7.00
## Median : 6.376 Median : 967.8 Median :779.1 Median : 28.00
## Mean : 6.205 Mean : 972.9 Mean :773.6 Mean : 45.66
## 3rd Qu.:10.175 3rd Qu.:1029.1 3rd Qu.:824.0 3rd Qu.: 56.00
## Max. :32.200 Max. :1145.0 Max. :992.6 Max. :365.00
## strength
## Min. : 2.33
## 1st Qu.:23.68
## Median :34.40
## Mean :35.82
## 3rd Qu.:46.10
## Max. :82.60
- Splitting the H2O frame into 75% training and 25% test sets, then fitting a deep network with two hidden layers of 200 tanh units and a Gaussian (squared-error) objective.
splits <- h2o.splitFrame(concrete.hex, 0.75, seed=1234)
dl <- h2o.deeplearning(x = 1:8, y = "strength", training_frame = splits[[1]],
                       activation = "Tanh", hidden = c(200, 200),
                       distribution = "gaussian")
dl.predict <- h2o.predict (dl, splits[[2]])
cor(as.vector(dl.predict), as.vector(splits[[2]]$strength))
## [1] 0.9112169447
dl@parameters
## $model_id
## [1] "DeepLearning_model_R_1495667882195_5"
##
## $training_frame
## [1] "RTMP_sid_a748_2"
##
## $activation
## [1] "Tanh"
##
## $seed
## [1] 8908975863228854272
##
## $distribution
## [1] "gaussian"
##
## $x
## [1] "cement" "slag" "ash" "water"
## [5] "superplastic" "coarseagg" "fineagg" "age"
##
## $y
## [1] "strength"
h2o.performance(dl)
## H2ORegressionMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
##
## MSE: 44.87359936
## RMSE: 6.69877596
## MAE: 5.415179694
## RMSLE: 0.2430829426
## Mean Residual Deviance : 44.87359936
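- The metrics above are reported on the training frame. Held-out performance could be checked directly (a sketch, assuming the dl model and splits from above):
h2o.performance(dl, newdata = splits[[2]])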
h2o.shutdown(prompt = TRUE)
## Are you sure you want to shutdown the H2O instance running at http://localhost:54321/ (Y/N)?