Part 1: Association Analysis

Question 1- Load the Bob_Ross file into R to do an Association Analysis

#install.packages("arules") for association analysis
library(arules)
# Load the Bob Ross episode data: EPISODE, TITLE, and one 0/1 indicator column per painting element
# NOTE(review): absolute, machine-specific path -- adjust it before running on another computer
bob_ross <- read.csv("G:/Other computers/My Laptop/Documents/Richard 622 last semester/Week 1 and 2/Homework 1/bob_ross.csv")

Question 2- Remove all non-binary columns and make into a Matrix

# Preview the first six rows to see the layout and identify which columns are non-binary
head(bob_ross)
##   EPISODE                 TITLE APPLE_FRAME AURORA_BOREALIS BARN BEACH BOAT
## 1  S01E01 "A WALK IN THE WOODS"           0               0    0     0    0
## 2  S01E02        "MT. MCKINLEY"           0               0    0     0    0
## 3  S01E03        "EBONY SUNSET"           0               0    0     0    0
## 4  S01E04         "WINTER MIST"           0               0    0     0    0
## 5  S01E05        "QUIET STREAM"           0               0    0     0    0
## 6  S01E06         "WINTER MOON"           0               0    0     0    0
##   BRIDGE BUILDING BUSHES CABIN CACTUS CIRCLE_FRAME CIRRUS CLIFF CLOUDS CONIFER
## 1      0        0      1     0      0            0      0     0      0       0
## 2      0        0      0     1      0            0      0     0      1       1
## 3      0        0      0     1      0            0      0     0      0       1
## 4      0        0      1     0      0            0      0     0      1       1
## 5      0        0      0     0      0            0      0     0      0       0
## 6      0        0      0     1      0            0      0     0      0       1
##   CUMULUS DECIDUOUS DIANE_ANDRE DOCK DOUBLE_OVAL_FRAME FARM FENCE FIRE
## 1       0         1           0    0                 0    0     0    0
## 2       0         0           0    0                 0    0     0    0
## 3       0         0           0    0                 0    0     1    0
## 4       0         0           0    0                 0    0     0    0
## 5       0         1           0    0                 0    0     0    0
## 6       0         0           0    0                 0    0     0    0
##   FLORIDA_FRAME FLOWERS FOG FRAMED GRASS GUEST HALF_CIRCLE_FRAME
## 1             0       0   0      0     1     0                 0
## 2             0       0   0      0     0     0                 0
## 3             0       0   0      0     0     0                 0
## 4             0       0   0      0     0     0                 0
## 5             0       0   0      0     0     0                 0
## 6             0       0   0      0     0     0                 0
##   HALF_OVAL_FRAME HILLS LAKE LAKES LIGHTHOUSE MILL MOON MOUNTAIN MOUNTAINS
## 1               0     0    0     0          0    0    0        0         0
## 2               0     0    0     0          0    0    0        1         0
## 3               0     0    0     0          0    0    0        1         1
## 4               0     0    1     0          0    0    0        1         0
## 5               0     0    0     0          0    0    0        0         0
## 6               0     0    1     0          0    0    1        1         1
##   NIGHT OCEAN OVAL_FRAME PALM_TREES PATH PERSON PORTRAIT RECTANGLE_3D_FRAME
## 1     0     0          0          0    0      0        0                  0
## 2     0     0          0          0    0      0        0                  0
## 3     0     0          0          0    0      0        0                  0
## 4     0     0          0          0    0      0        0                  0
## 5     0     0          0          0    0      0        0                  0
## 6     1     0          0          0    0      0        0                  0
##   RECTANGULAR_FRAME RIVER ROCKS SEASHELL_FRAME SNOW SNOWY_MOUNTAIN SPLIT_FRAME
## 1                 0     1     0              0    0              0           0
## 2                 0     0     0              0    1              1           0
## 3                 0     0     0              0    0              0           0
## 4                 0     0     0              0    0              1           0
## 5                 0     1     1              0    0              0           0
## 6                 0     0     0              0    1              1           0
##   STEVE_ROSS STRUCTURE SUN TOMB_FRAME TREE TREES TRIPLE_FRAME WATERFALL WAVES
## 1          0         0   0          0    1     1            0         0     0
## 2          0         0   0          0    1     1            0         0     0
## 3          0         1   1          0    1     1            0         0     0
## 4          0         0   0          0    1     1            0         0     0
## 5          0         0   0          0    1     1            0         0     0
## 6          0         1   0          0    1     1            0         0     0
##   WINDMILL WINDOW_FRAME WINTER WOOD_FRAMED
## 1        0            0      0           0
## 2        0            0      1           0
## 3        0            0      1           0
## 4        0            0      0           0
## 5        0            0      0           0
## 6        0            0      1           0
# EPISODE and TITLE (columns 1 and 2) are text identifiers rather than 0/1
# indicators, so keep every column except those two
bob_ross <- bob_ross[setdiff(names(bob_ross), c("EPISODE", "TITLE"))]

# Turn the remaining 0/1 columns into a matrix and coerce that matrix to an
# arules "transactions" object for the association analysis
bob_ross <- as(as.matrix(bob_ross), "transactions")

Question 3- Perform association analysis to determine rules with a support of at least 30% and confidence of at least 90%

# Mine association rules with a minimum support of 30% and a minimum
# confidence of 90%. The parameter names are written out in full ("support",
# "confidence") instead of relying on partial matching of "sup" and "conf".
bobross_rules <- apriori(
  bob_ross,
  parameter = list(support = 0.3, confidence = 0.9, target = "rules")
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 120 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[66 item(s), 403 transaction(s)] done [0.00s].
## sorting and recoding items ... [9 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [31 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# List every mined rule with its support, confidence, coverage, lift and count
inspect(bobross_rules)
##      lhs                           rhs     support   confidence coverage 
## [1]  {RIVER}                    => {TREES} 0.3002481 0.9603175  0.3126551
## [2]  {RIVER}                    => {TREE}  0.3126551 1.0000000  0.3126551
## [3]  {GRASS}                    => {TREE}  0.3374690 0.9577465  0.3523573
## [4]  {LAKE}                     => {TREES} 0.3325062 0.9370629  0.3548387
## [5]  {LAKE}                     => {TREE}  0.3523573 0.9930070  0.3548387
## [6]  {MOUNTAIN}                 => {TREES} 0.3672457 0.9250000  0.3970223
## [7]  {MOUNTAIN}                 => {TREE}  0.3870968 0.9750000  0.3970223
## [8]  {DECIDUOUS}                => {TREES} 0.5136476 0.9118943  0.5632754
## [9]  {DECIDUOUS}                => {TREE}  0.5632754 1.0000000  0.5632754
## [10] {CONIFER}                  => {TREES} 0.5161290 0.9811321  0.5260546
## [11] {CONIFER}                  => {TREE}  0.5260546 1.0000000  0.5260546
## [12] {TREES}                    => {TREE}  0.8362283 1.0000000  0.8362283
## [13] {TREE}                     => {TREES} 0.8362283 0.9335180  0.8957816
## [14] {RIVER, TREES}             => {TREE}  0.3002481 1.0000000  0.3002481
## [15] {RIVER, TREE}              => {TREES} 0.3002481 0.9603175  0.3126551
## [16] {GRASS, TREES}             => {TREE}  0.3126551 1.0000000  0.3126551
## [17] {GRASS, TREE}              => {TREES} 0.3126551 0.9264706  0.3374690
## [18] {LAKE, TREES}              => {TREE}  0.3325062 1.0000000  0.3325062
## [19] {LAKE, TREE}               => {TREES} 0.3325062 0.9436620  0.3523573
## [20] {CONIFER, MOUNTAIN}        => {TREES} 0.3126551 0.9767442  0.3200993
## [21] {CONIFER, MOUNTAIN}        => {TREE}  0.3200993 1.0000000  0.3200993
## [22] {MOUNTAIN, TREES}          => {TREE}  0.3672457 1.0000000  0.3672457
## [23] {MOUNTAIN, TREE}           => {TREES} 0.3672457 0.9487179  0.3870968
## [24] {CLOUDS, TREES}            => {TREE}  0.3424318 1.0000000  0.3424318
## [25] {CLOUDS, TREE}             => {TREES} 0.3424318 0.9387755  0.3647643
## [26] {DECIDUOUS, TREES}         => {TREE}  0.5136476 1.0000000  0.5136476
## [27] {DECIDUOUS, TREE}          => {TREES} 0.5136476 0.9118943  0.5632754
## [28] {CONIFER, TREES}           => {TREE}  0.5161290 1.0000000  0.5161290
## [29] {CONIFER, TREE}            => {TREES} 0.5161290 0.9811321  0.5260546
## [30] {CONIFER, MOUNTAIN, TREES} => {TREE}  0.3126551 1.0000000  0.3126551
## [31] {CONIFER, MOUNTAIN, TREE}  => {TREES} 0.3126551 0.9767442  0.3200993
##      lift     count
## [1]  1.148392 121  
## [2]  1.116343 126  
## [3]  1.069174 136  
## [4]  1.120583 134  
## [5]  1.108537 142  
## [6]  1.106157 148  
## [7]  1.088435 156  
## [8]  1.090485 207  
## [9]  1.116343 227  
## [10] 1.173283 208  
## [11] 1.116343 212  
## [12] 1.116343 337  
## [13] 1.116343 337  
## [14] 1.116343 121  
## [15] 1.148392 121  
## [16] 1.116343 126  
## [17] 1.107916 126  
## [18] 1.116343 134  
## [19] 1.128474 134  
## [20] 1.168035 126  
## [21] 1.116343 129  
## [22] 1.116343 148  
## [23] 1.134520 148  
## [24] 1.116343 138  
## [25] 1.122631 138  
## [26] 1.116343 207  
## [27] 1.090485 207  
## [28] 1.116343 208  
## [29] 1.173283 208  
## [30] 1.116343 126  
## [31] 1.168035 126
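# All rules have a lift above 1, which means the right-hand-side item (TREE or TREES) is more likely when the left-hand-side items are present than it is on average
# The lifts are only slightly above 1 (about 1.07 to 1.17) because Bob Ross almost always painted trees; some of the types of landscapes that he painted with trees are rivers, grass, lakes, and mountains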

Part 2: Predicting a Numeric Column of Data

Question 1- Load the nhanes_train csv file into R

# Load the NHANES training data (columns: Age, Weight, Height, Pulse)
# NOTE(review): absolute, machine-specific path -- adjust it before running on another computer
nhanes_train <- read.csv("G:/Other computers/My Laptop/Documents/Richard 622 last semester/Week 1 and 2/Homework 1/nhanes_train.csv")
# Preview the first six rows to check the layout
head(nhanes_train)
##   Age Weight Height Pulse
## 1  80   97.6  175.8    58
## 2  48  120.7  185.9    56
## 3  20   46.8  163.6    78
## 4  18  100.2  176.9    84
## 5  46  112.2  180.5    90
## 6  63   79.3  182.8    66

Question 2- Predict a person's weight using regression, decision tree, bagging, random forest, and boosting models to see which does the best job at making a prediction

Regression Model
# Linear regression of Weight on every other column (Age, Height, Pulse)
regmodel <- lm(formula = Weight ~ ., data = nhanes_train)
# Coefficient table, residual summary and R-squared of the fitted model
summary(object = regmodel)
## 
## Call:
## lm(formula = Weight ~ ., data = nhanes_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.860 -13.096  -3.099  10.227 116.080 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -112.64017    6.78932 -16.591  < 2e-16 ***
## Age            0.12471    0.02045   6.097 1.22e-09 ***
## Height         1.01324    0.03522  28.768  < 2e-16 ***
## Pulse          0.24685    0.03019   8.177 4.32e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.67 on 2867 degrees of freedom
## Multiple R-squared:  0.2311, Adjusted R-squared:  0.2303 
## F-statistic: 287.2 on 3 and 2867 DF,  p-value: < 2.2e-16
#all variables are significant based on P values
Decision Tree
# Libraries needed to run a decision tree below
library(rpart)
library(rpart.plot)

# Fit a regression tree for Weight on all other columns, then plot it.
# extra = 1 shows the number of observations in each node; digits = 4 sets the
# number of significant digits used for the displayed values.
reg_treenhanes <- rpart(Weight ~ ., data = nhanes_train)
prp(x = reg_treenhanes, extra = 1, digits = 4)

Bagging Model
#install.packages('ipred')
library(ipred)

# Bagging model with 100 bootstrapped samples.
# Bootstrap resampling is random, so the RNG seed is fixed first; without a
# seed the fitted model and its test MSE change from run to run
# (327.1046 in one run vs 327.1284 in another, see the MSE section).
set.seed(622)
nhanes_bag <- bagging(Weight ~ ., data = nhanes_train, nbagg = 100)
nhanes_bag
## 
## Bagging regression trees with 100 bootstrap replications 
## 
## Call: bagging.data.frame(formula = Weight ~ ., data = nhanes_train, 
##     nbagg = 100)
Random Forest Model
#install.packages('randomForest')
library(randomForest)

# Random forest of 200 trees; importance = TRUE also stores variable-importance
# measures. Each tree is grown on a random bootstrap sample with random
# candidate variables at each split, so the RNG seed is fixed to make the fit
# (and the test MSE computed from it) reproducible.
set.seed(622)
nhanes_rf <- randomForest(Weight ~ ., data = nhanes_train, importance = TRUE, ntree = 200)
Boosting Model
# install.packages("gbm")
library(gbm)

# Interaction depth is how many splits in each tree we want.
# Shrinkage determines how much each tree contributes to the prediction.
# By default gbm fits each tree on a random subsample of the training rows, so
# the RNG seed is fixed to make the model and its test MSE reproducible.
set.seed(622)
nhanes_boost <- gbm(
  formula = Weight ~ ., data = nhanes_train, distribution = "gaussian",
  n.trees = 200, shrinkage = 0.03, interaction.depth = 5
)
# Relative influence of each predictor on the boosted model
summary(nhanes_boost)

##           var  rel.inf
## Height Height 65.62662
## Pulse   Pulse 17.42341
## Age       Age 16.94998
#height has the most influence when determining weight

Question 3- Which model above is the best at predicting?

# Load the held-out test set; every model is scored on it below
# NOTE(review): absolute, machine-specific path -- adjust it before running on another computer
nhanes_test <- read.csv("G:/Other computers/My Laptop/Documents/Richard 622 last semester/Week 1 and 2/Homework 1/nhanes_test.csv")

# Score the test set with each fitted model.
# An object name on a line of its own prints that model's predictions.

# Regression
predreg <- predict(regmodel, newdata = nhanes_test)
predreg

# Decision tree
nhanes_test_pred_tree <- predict(reg_treenhanes, newdata = nhanes_test)
nhanes_test_pred_tree

# Bagging (predictions are stored but not printed)
nhanes_bag_pred <- predict(nhanes_bag, newdata = nhanes_test)

# Random forest
nhanes_rf_pred <- predict(nhanes_rf, newdata = nhanes_test)
nhanes_rf_pred

# Boosting (predict with all 200 fitted trees)
nhanes_boost_pred <- predict(nhanes_boost, newdata = nhanes_test, n.trees = 200)
nhanes_boost_pred
# Test-set mean squared error (MSE) for each model.
# The trailing "original run" values were noted by hand in an earlier run; the
# "##" lines are the output of this run. The two differ for the models whose
# fitting involves random resampling (bagging, random forest, boosting).
mse <- function(predicted, actual) mean((predicted - actual)^2)

# Regression
mse(predreg, nhanes_test$Weight) # original run: 332.8215
## [1] 332.8215
# Decision tree
mse(nhanes_test_pred_tree, nhanes_test$Weight) # original run: 334.6264
## [1] 334.6264
# Bagging
mse(nhanes_bag_pred, nhanes_test$Weight) # original run: 327.1046
## [1] 327.1284
# Random forest
mse(nhanes_rf_pred, nhanes_test$Weight) # original run: 262.1942
## [1] 260.3832
# Boosting
mse(nhanes_boost_pred, nhanes_test$Weight) # original run: 314.9958
## [1] 315.2021
# (Original run) The best model to predict weight is the Random Forest Model with an MSE of 262.1942
# (260.3832 in this run, which is still the lowest of the five models)

Part 3: Clustering

Question 1- Use R to compute the mean of each variable in the nhanes_train csv file

# summary() reports the mean of every variable on its "Mean" row, alongside the minimum, quartiles, median and maximum
summary(nhanes_train)
##       Age            Weight           Height          Pulse       
##  Min.   :16.00   Min.   : 39.30   Min.   :139.9   Min.   : 40.00  
##  1st Qu.:30.00   1st Qu.: 67.50   1st Qu.:162.2   1st Qu.: 64.00  
##  Median :44.00   Median : 79.40   Median :169.7   Median : 72.00  
##  Mean   :45.15   Mean   : 82.63   Mean   :169.5   Mean   : 72.32  
##  3rd Qu.:58.00   3rd Qu.: 94.65   3rd Qu.:176.7   3rd Qu.: 80.00  
##  Max.   :80.00   Max.   :223.00   Max.   :199.9   Max.   :122.00
  #Age            Weight           Height          Pulse       
# Mean   :45.15   Mean   : 82.63   Mean   :169.5   Mean   : 72.32  

Question 2- Perform k-means clustering on the data in the nhanes_train csv file. Create 4 distinct clusters

library(fpc)
# k-means with 4 clusters on the four columns (Age, Weight, Height, Pulse).
# - The earlier data(nhanes_train) call is removed: data() loads datasets that
#   ship with packages, whereas nhanes_train is a data frame read from a CSV
#   file, so that call loaded nothing useful here.
# - kmeans() chooses its starting centres at random, so the RNG seed is fixed
#   to keep cluster sizes and numbering the same from run to run.
# - nstart = 25 tries 25 random starts and keeps the best solution instead of
#   trusting a single random start.
set.seed(622)
fit <- kmeans(nhanes_train[, 1:4], centers = 4, nstart = 25)
fit
## K-means clustering with 4 clusters of sizes 847, 821, 716, 487
## 
## Cluster means:
##        Age    Weight   Height    Pulse
## 1 64.30224  73.19764 164.9505 68.76033
## 2 36.82095  88.26090 174.1502 71.74178
## 3 30.04888  63.92165 165.1246 75.13966
## 4 48.06571 117.06982 176.2801 75.34292
## 
## Clustering vector:
##    [1] 1 4 3 2 4 1 2 2 2 1 3 3 2 4 4 2 3 3 3 1 2 3 1 2 1 1 1 4 3 2 2 1 3 4 3 2 1
##   [38] 1 2 2 2 1 1 2 2 1 3 1 2 1 1 1 3 4 3 1 4 1 1 1 2 3 3 3 1 3 4 2 2 3 1 1 2 1
##   [75] 4 3 3 2 3 2 1 2 4 3 4 3 3 1 1 2 3 1 3 4 4 1 2 3 3 3 1 1 1 3 3 4 1 2 1 3 3
##  [112] 3 2 4 1 3 1 2 1 3 3 1 1 1 1 2 2 4 4 2 2 2 1 4 1 3 3 4 4 1 1 1 1 1 2 4 1 2
##  [149] 3 1 2 3 2 3 1 1 2 3 2 2 2 3 4 2 4 3 3 2 3 1 2 2 2 3 2 2 3 2 3 1 2 3 2 3 3
##  [186] 1 4 1 2 3 1 2 4 2 1 2 1 3 3 1 2 1 2 2 4 1 4 1 1 3 2 2 4 4 2 4 1 3 3 4 2 3
##  [223] 3 4 2 2 3 4 1 1 3 2 4 1 2 4 4 1 1 2 3 3 2 1 1 4 3 1 1 1 3 2 1 2 3 4 1 2 1
##  [260] 2 1 4 4 1 2 1 4 4 1 3 2 3 2 4 3 2 2 4 2 1 1 1 1 1 1 4 2 3 2 3 2 4 4 3 3 3
##  [297] 1 4 4 3 2 4 3 3 2 4 1 2 3 1 1 3 3 2 3 1 3 2 1 3 4 4 2 2 2 3 4 1 3 3 3 1 2
##  [334] 3 1 1 1 3 3 4 1 2 2 1 2 1 3 3 2 1 4 1 4 3 4 3 4 3 4 2 1 1 3 2 2 3 1 3 1 3
##  [371] 1 3 2 3 2 3 1 2 3 1 2 1 2 2 1 3 2 3 4 3 1 1 1 3 1 4 2 2 3 2 2 2 3 2 1 2 1
##  [408] 1 4 3 2 2 1 2 1 4 1 1 3 1 1 1 1 2 2 3 1 4 1 2 4 3 3 3 2 3 3 4 2 3 1 3 2 2
##  [445] 3 1 3 2 3 4 1 2 1 1 2 1 4 1 3 3 2 2 2 4 4 2 2 1 2 1 2 1 2 4 2 2 4 2 4 2 3
##  [482] 2 1 3 3 1 4 1 2 2 2 3 2 3 1 3 2 2 1 3 1 4 4 2 4 2 2 2 3 1 1 2 1 3 1 1 2 4
##  [519] 2 3 3 3 2 2 1 1 3 4 3 2 1 1 1 1 2 4 2 2 3 1 4 4 1 2 3 2 4 3 2 3 1 2 2 1 2
##  [556] 4 1 3 4 1 2 2 1 1 2 2 3 1 1 1 4 3 1 4 3 2 4 3 1 2 3 3 4 2 1 3 3 1 4 4 1 3
##  [593] 2 2 1 2 1 1 1 3 3 2 3 1 1 3 1 4 3 2 1 3 2 1 1 2 3 2 1 3 3 2 3 2 2 2 4 2 1
##  [630] 1 1 2 1 2 4 1 1 4 4 2 1 4 1 3 3 4 2 2 1 2 3 4 1 3 1 2 2 3 2 1 2 3 3 3 3 1
##  [667] 2 2 3 3 3 3 4 1 4 3 1 2 4 2 4 2 2 4 4 3 3 3 1 2 1 4 2 2 1 1 3 4 1 2 4 2 3
##  [704] 4 4 4 1 1 1 1 3 3 1 1 3 3 3 3 2 1 1 1 1 2 3 3 2 1 4 1 4 4 3 2 3 1 1 2 4 2
##  [741] 3 1 4 3 3 1 3 3 1 2 3 2 2 1 1 1 1 1 2 2 3 2 2 1 3 2 3 2 3 1 2 4 4 4 3 2 4
##  [778] 2 1 2 3 2 1 1 2 1 2 4 3 3 4 3 3 1 2 1 1 2 1 4 3 2 1 2 1 3 2 1 1 3 3 4 2 4
##  [815] 4 2 4 2 3 2 1 3 2 3 2 2 1 4 2 1 1 4 1 2 3 1 1 3 2 2 2 2 3 1 2 1 3 2 2 1 3
##  [852] 4 1 2 3 2 3 2 2 3 1 4 1 3 3 1 3 1 3 1 1 4 4 2 1 2 1 1 2 3 2 1 4 1 2 2 2 2
##  [889] 4 1 2 2 4 1 3 2 4 1 2 4 1 4 1 2 2 1 1 3 1 2 2 3 2 2 2 2 3 2 3 1 3 2 4 2 3
##  [926] 3 4 1 1 3 3 2 1 3 4 1 2 2 1 1 1 2 2 1 2 1 2 3 4 2 3 3 1 1 1 2 4 2 1 2 3 3
##  [963] 3 4 3 3 3 4 2 2 1 2 1 3 2 4 3 2 4 4 3 4 1 4 2 1 3 1 3 1 3 2 3 4 3 1 4 2 4
## [1000] 1 3 2 1 4 1 2 3 4 1 2 4 3 3 1 2 2 2 2 1 3 2 4 4 1 2 1 3 4 3 4 1 3 3 4 4 3
## [1037] 4 2 3 1 4 3 3 2 3 1 4 3 1 3 3 3 3 1 4 1 2 2 2 2 2 3 2 1 1 1 3 3 3 1 2 3 3
## [1074] 2 4 1 2 4 1 1 1 4 4 3 2 4 1 3 3 3 3 1 1 4 4 2 3 2 2 3 2 1 2 1 1 2 3 4 2 2
## [1111] 2 3 1 2 4 1 4 4 3 1 4 4 4 2 2 1 3 2 2 1 1 2 3 2 3 3 2 1 4 3 4 2 2 3 1 3 2
## [1148] 1 3 1 1 1 3 3 3 1 1 2 2 4 1 3 3 2 3 4 4 3 1 2 2 4 4 1 1 4 2 4 1 3 4 3 3 1
## [1185] 1 3 1 2 4 4 2 1 2 2 4 4 1 2 2 1 2 1 1 3 2 3 2 1 3 3 4 2 2 2 2 1 2 1 1 4 2
## [1222] 2 2 3 1 1 3 1 3 3 1 4 3 3 4 1 3 4 2 2 2 1 1 3 4 4 3 4 1 1 3 3 3 1 1 3 4 4
## [1259] 4 1 1 1 2 1 3 3 3 1 1 2 1 2 2 3 2 2 1 2 2 1 1 1 1 1 3 2 1 2 3 3 3 3 3 3 3
## [1296] 3 4 2 2 4 2 4 2 1 1 1 1 2 1 1 4 3 4 2 1 2 4 3 1 1 4 2 3 2 3 1 2 1 1 4 4 4
## [1333] 1 1 3 2 4 1 1 4 1 1 1 4 2 2 1 3 4 3 2 3 1 1 2 1 4 2 3 2 1 1 1 4 1 2 1 3 2
## [1370] 2 2 1 3 1 1 2 3 1 3 4 2 3 1 1 1 1 1 3 3 1 2 1 2 2 4 3 1 2 1 1 1 2 1 3 4 3
## [1407] 1 4 2 1 4 2 1 4 2 4 3 2 1 1 3 4 2 1 4 2 2 1 1 1 1 4 1 4 4 3 3 2 1 1 1 3 2
## [1444] 4 2 3 3 1 1 1 4 1 1 3 4 2 3 3 1 2 2 2 1 3 4 3 3 2 3 3 3 3 2 1 4 2 3 3 2 1
## [1481] 1 1 4 4 3 4 2 1 2 3 3 1 4 2 3 3 2 1 4 3 1 1 4 2 1 1 1 4 2 3 1 3 2 2 1 3 1
## [1518] 4 2 1 1 3 1 4 4 1 1 1 3 1 4 2 3 2 2 2 3 1 2 2 1 2 4 3 3 4 3 1 1 2 2 2 3 2
## [1555] 4 1 1 3 2 1 4 1 1 1 3 3 1 1 2 4 2 3 4 4 3 2 1 2 2 4 1 3 4 2 2 3 2 1 3 2 1
## [1592] 1 2 3 2 3 1 3 4 2 1 2 1 1 1 2 1 1 1 2 2 2 4 4 2 1 4 4 1 2 3 2 2 1 2 2 4 1
## [1629] 2 1 2 1 2 3 3 3 2 4 1 3 2 2 1 3 2 2 1 1 2 1 2 3 2 1 4 2 2 3 1 2 2 1 4 1 2
## [1666] 1 1 2 3 2 4 3 3 3 4 1 2 3 4 4 2 3 3 2 3 2 1 1 3 2 4 1 1 1 3 4 4 3 4 4 2 2
## [1703] 4 3 1 1 2 4 3 3 3 2 4 1 1 1 4 1 4 2 2 1 4 1 4 3 2 2 3 2 1 4 1 3 1 3 2 3 4
## [1740] 2 4 4 3 3 1 2 2 1 2 2 2 1 4 1 3 2 2 2 3 1 3 1 1 3 4 2 4 2 3 3 2 1 3 3 3 3
## [1777] 1 4 1 4 1 2 3 2 2 1 2 3 2 1 2 1 2 2 2 3 4 3 1 2 3 1 1 1 1 3 1 4 1 1 1 4 2
## [1814] 1 3 2 2 3 3 2 1 3 3 1 4 1 2 1 1 1 1 1 2 1 2 3 3 1 2 1 4 2 4 2 3 1 3 3 2 3
## [1851] 4 2 1 1 2 1 1 1 1 3 2 4 1 1 2 1 2 3 2 3 2 1 1 4 1 2 3 1 1 1 3 3 2 1 2 2 2
## [1888] 2 3 3 2 1 2 4 3 1 4 1 4 2 2 1 2 1 1 2 3 3 2 4 3 4 3 4 1 2 1 3 1 4 3 2 3 4
## [1925] 2 2 4 1 4 2 1 4 4 1 1 3 1 4 2 2 3 3 4 2 4 3 4 3 4 4 1 1 3 2 2 2 3 1 1 1 2
## [1962] 1 2 3 1 2 3 1 1 1 3 3 3 2 2 1 1 3 1 4 3 2 2 2 1 4 3 3 1 1 3 1 3 2 2 4 2 3
## [1999] 3 2 3 3 1 4 2 2 4 4 2 2 2 1 4 1 3 2 1 2 3 3 2 3 1 2 1 4 4 2 4 3 3 4 2 4 3
## [2036] 1 1 3 2 2 1 2 2 2 2 3 4 4 3 3 2 3 2 4 3 3 4 4 3 3 4 1 1 3 2 4 3 2 4 2 2 4
## [2073] 3 3 2 3 4 4 1 4 3 1 1 3 2 4 4 4 3 1 1 2 3 2 3 4 2 1 4 2 4 2 1 1 2 1 3 2 2
## [2110] 1 3 2 2 3 1 3 2 4 4 3 4 3 3 2 3 2 4 1 1 2 3 1 2 1 3 2 2 2 3 3 1 3 1 3 1 3
## [2147] 3 4 3 2 4 1 4 2 3 3 2 1 1 2 3 4 4 1 2 2 1 1 4 2 2 3 4 4 3 4 1 1 4 1 1 1 2
## [2184] 2 2 2 3 1 1 4 4 3 2 2 1 4 2 3 1 3 1 4 4 2 3 1 1 2 1 4 3 4 2 2 4 2 4 3 3 4
## [2221] 4 3 1 4 3 3 4 3 4 2 2 2 2 3 2 2 4 1 2 2 1 1 4 4 3 1 1 3 3 3 1 1 3 3 2 1 2
## [2258] 3 4 3 3 4 1 1 2 1 4 4 1 3 4 4 3 1 2 2 3 2 2 3 1 3 2 4 2 3 4 3 4 2 4 1 4 2
## [2295] 2 3 1 2 3 1 1 3 1 4 2 2 3 1 2 3 3 1 3 3 3 1 3 4 2 4 1 1 3 2 1 1 2 4 1 3 1
## [2332] 4 2 1 4 2 1 1 1 4 3 3 1 1 2 2 3 2 3 2 1 3 3 2 1 1 1 2 1 2 1 1 4 3 1 3 1 1
## [2369] 4 3 2 2 1 2 4 4 4 2 1 2 3 1 4 1 2 4 1 1 1 3 3 3 2 2 4 4 2 1 1 1 2 2 3 1 3
## [2406] 3 3 3 2 1 3 2 1 3 2 2 2 2 1 2 2 2 2 3 3 3 1 3 4 4 4 4 3 1 3 3 2 2 2 3 1 3
## [2443] 1 2 1 2 1 1 2 3 3 1 2 4 2 1 3 1 1 3 3 4 2 3 3 2 4 2 2 2 3 3 4 4 3 1 2 3 2
## [2480] 2 2 4 3 2 2 2 3 2 4 3 4 3 2 1 1 4 2 1 2 1 3 4 2 4 3 2 3 2 3 4 3 2 1 2 4 1
## [2517] 1 1 2 1 1 1 3 4 2 1 1 3 3 2 2 3 1 4 4 3 2 3 2 4 1 3 2 1 3 1 2 2 2 2 3 1 3
## [2554] 1 1 1 3 2 1 3 4 3 1 3 3 4 4 3 4 2 4 4 4 3 2 1 3 1 2 2 1 2 4 2 1 1 4 4 4 3
## [2591] 1 1 2 1 1 4 1 2 2 1 2 1 2 2 3 3 1 1 2 1 2 2 4 1 3 2 2 2 3 3 1 2 1 2 1 2 2
## [2628] 2 1 4 1 4 2 3 2 1 2 3 1 2 3 1 1 2 1 4 1 3 4 1 2 3 2 2 2 4 3 1 2 1 1 1 3 3
## [2665] 3 4 1 2 4 4 2 3 1 3 1 3 3 4 2 2 4 1 2 1 1 2 3 1 3 1 1 2 4 2 4 1 4 3 3 3 1
## [2702] 2 4 3 1 1 1 4 1 2 1 1 2 2 4 2 2 3 1 3 2 1 1 1 1 1 4 3 2 3 1 4 4 3 2 1 1 4
## [2739] 2 1 2 2 3 4 2 4 2 4 2 4 4 4 1 2 1 2 3 2 3 1 1 1 1 3 3 3 1 1 1 4 2 1 2 4 3
## [2776] 1 3 3 2 2 1 2 2 2 3 1 2 2 2 1 1 3 1 4 1 4 1 1 2 3 1 2 3 3 1 4 4 2 1 2 2 4
## [2813] 3 3 2 1 1 3 3 3 1 2 2 1 3 2 4 4 2 2 1 3 4 2 3 1 2 2 1 2 2 1 1 4 1 2 2 2 2
## [2850] 2 4 1 4 1 4 4 1 2 1 1 3 3 3 4 3 1 1 3 3 1 4
## 
## Within cluster sum of squares by cluster:
## [1] 370405.2 317006.3 262863.8 365425.0
##  (between_SS / total_SS =  54.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
#(Original run, which differs from the output printed above) K-means clustering with 4 clusters of sizes 717, 685, 444, 1025

#Cluster means:
#       Age    Weight   Height    Pulse
#1 62.96513  68.54114 162.5576 69.59554
#2 55.77664  95.28380 174.8207 68.92263
#3 34.44595 114.59167 175.8622 78.14414
#4 30.21463  70.19454 168.1748 73.97463

#Compared to the mean of all data

  #Age            Weight           Height          Pulse       
# Mean   :45.15   Mean   : 82.63   Mean   :169.5   Mean   : 72.32 

#Looking at the original run's cluster means listed above: Age - clusters 1 and 2 are above average and clusters 3 and 4 are below average
#Weight: Clusters 1 and 4 are below average and clusters 2 and 3 are above average
#Height: Clusters 1 and 4 are below average and clusters 2 and 3 are above average
#Pulse:  Clusters 1 and 2 are below average and clusters 3 and 4 are above average
#b) Determine which cluster the data in row 2000 was placed in
fit$cluster[2000]
## [1] 2
# In the original run, row 2000 was placed in cluster 4 ([1] 4).
# k-means cluster numbers are arbitrary labels that depend on the random starting centres, so the number can differ between runs.