## Warning: package 'class' was built under R version 3.6.3

1.

Exploratory analysis

str(women)

## 'data.frame':    15 obs. of  2 variables:
##  $ height: num  58 59 60 61 62 63 64 65 66 67 ...
##  $ weight: num  115 117 120 123 126 129 132 135 139 142 ...

The height variable was measured in inch and weight was measured in pounds

summary(women)

##      height         weight     
##  Min.   :58.0   Min.   :115.0  
##  1st Qu.:61.5   1st Qu.:124.5  
##  Median :65.0   Median :135.0  
##  Mean   :65.0   Mean   :136.7  
##  3rd Qu.:68.5   3rd Qu.:148.0  
##  Max.   :72.0   Max.   :164.0

The mean and the meadian of height have the same value. The value of mean and meadian of weight are close (1.7 lbs different)

It seems the distribution of height and weight are both symmetrical.

boxplot(women$height, main = "Boxplot of height")

boxplot(women$weight, main = "Boxplot of weight")

plot(women$height, women$weight, xlab = "Height", ylab = "Weight", pch =19)

Based on the plot, there might be a correlation between height and weight.

2.

Hypothesis: There is a positive correlation between women height and their weight.

cor.test(women$height,women$weight)

## 
##  Pearson's product-moment correlation
## 
## data:  women$height and women$weight
## t = 37.855, df = 13, p-value = 1.091e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9860970 0.9985447
## sample estimates:
##       cor 
## 0.9954948

We have \(p-value = 1.091^{-14} (< 0.05)\) and \(cor.coeff = 0.9954948 \approx 1\)

Based on Pearson test, there is a strong positive correlation between the height and weight of women.

plot(women$height, women$weight, xlab = "Height", ylab = "Weight", pch =19, main = "Relationship of Height and Weight")
abline(lm(women$weight~women$height))

3.

Based on the graph in question 2, I can build linear regression to predict the weight of Sandra Elaine Allen using her height.

Divide the dataset into 2 smaller datasets randomly (w1 and w2)

set.seed(011)
w1 <- women[sample(nrow(women), 8), ]
w2 <- women[ !(women$height %in% w1$height), ]

Use w1 to build the model

lm(w1$weight~w1$height)

## 
## Call:
## lm(formula = w1$weight ~ w1$height)
## 
## Coefficients:
## (Intercept)    w1$height  
##     -62.274        3.042

Model: (W for weight and H for height)

\(W = -62.274 + 3.042 \times H\)

Test the model using w2:

W <- rep(0,7)
 for (i in 1:7) {W[i] <- w2$height[i]*3.042 - 62.274
}
W

## [1] 120.246 132.414 144.582 147.624 150.666 153.708 156.750

Error of the model:

dif <- rep(0,7)
for (i in 1:7) {dif[i] <- abs(w2$weight[i] - W[i])}
dif

## [1] 0.246 0.414 1.418 2.376 3.334 5.292 7.250

max(dif)

## [1] 7.25

It seems the predict weight can be 7.25 lbs different from the actual weight base on the test.

Allen’s height is 7ft 7in. \(7ft. 7in = 91 in\) since \(1ft = 12 in\)

Predict Allen’s weight by using the above model.

91*3.042 - 62.274

## [1] 214.548

We will expecte Allen’s actual weight will be arround 214.548 lbs if we used this model.

If we use the entire dataset to build the model

WO <- lm(women$weight~women$height)
summary(WO)

## 
## Call:
## lm(formula = women$weight ~ women$height)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7333 -1.1333 -0.3833  0.7417  3.1167 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -87.51667    5.93694  -14.74 1.71e-09 ***
## women$height   3.45000    0.09114   37.85 1.09e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared:  0.991,  Adjusted R-squared:  0.9903 
## F-statistic:  1433 on 1 and 13 DF,  p-value: 1.091e-14

The model will be: \(W = -87.51667 + 3.45 \times H\)

Predict Allen’s weight by using the above model.

-87.51667 + 3.45*91

## [1] 226.4333

We will expecte Allen’s actual weight will be arround 226.4333 lbs if we used this model.

4.

Import the dataset

Data <- read.csv("D:/Okanagan/Winter 2020/DSCI 100/Assigment/Last assignment/myData.csv")
Data <- Data[,-1]

summary(Data)

##    Feature.1       Feature.2       Feature.3       Feature.4     
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.1000  
##  1st Qu.:5.400   1st Qu.:3.000   1st Qu.:2.090   1st Qu.:0.6809  
##  Median :6.072   Median :3.252   Median :4.500   Median :1.5000  
##  Mean   :6.095   Mean   :3.324   Mean   :4.020   Mean   :1.4423  
##  3rd Qu.:6.700   3rd Qu.:3.706   3rd Qu.:5.500   3rd Qu.:2.1000  
##  Max.   :8.550   Max.   :5.171   Max.   :7.511   Max.   :3.2646

Data <- na.omit(Data) 
Data1 <- scale(Data)

set.seed(011)
# Determine number of clusters
# Determine withinss for clusters 1 - 20 with 15 tries
wss <- 1
for (i in 1:20) wss[i] <- sum(kmeans(Data1,
centers=i, nstart=1000)$withinss)
# Plot the withinss for each cluster
plot(1:20, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")

For me the cluster sum of square after 9th cluster is low and the value is not within groups sum of squares does not change much after that.

set.seed(011)
#K-Means Cluster Analysis
fit <- kmeans(Data1, 9, nstart=1000) # 9 cluster solution
# fit contains all the data about
# each cluster
# get cluster means
aggregate(Data1,by=list(fit$cluster),FUN=mean)

# append cluster assignment to each data point in the data set
Data1 <- data.frame(Data1, fit$cluster)
head(Data1)

fit

## K-means clustering with 9 clusters of sizes 40, 22, 29, 40, 40, 32, 19, 58, 20
## 
## Cluster means:
##     Feature.1  Feature.2   Feature.3  Feature.4
## 1 -0.81769088  0.8234958 -1.23483967 -0.9784518
## 2  0.26453881  0.5315408  0.65251272  0.8004467
## 3 -0.59306212 -1.5394630  0.05047642 -0.1677887
## 4 -1.34213483 -0.1214047 -1.41012265 -1.4089059
## 5  0.51095192 -0.4568852  0.82892225  0.9756175
## 6  1.56888673  0.9393686  1.10569315  1.2709078
## 7  1.61527457 -0.5343622  1.29284141  0.9172624
## 8  0.09465705 -0.6453076  0.33013576  0.1242195
## 9 -0.45254008  2.0331610 -1.11357676 -1.0788054
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   4   4   4   4   4   1   4   4   4   4   1   4   4   4   1   9   1   4   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   4   1   4   4   4   4   4   4   4   4   4   4   1   9   4   4   1   4   4   4 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   4   4   4   4   1   4   1   4   1   4   8   8   8   3   8   3   8   3   8   3 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   3   8   3   8   3   8   8   3   3   3   8   8   8   8   8   8   8   8   8   3 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   3   3   3   8   8   8   8   3   8   3   3   8   3   3   3   8   8   8   3   3 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   5   8   7   8   5   7   3   7   5   6   5   8   5   3   5   5   5   6   7   3 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   5   8   7   8   5   7   8   8   5   7   7   6   5   8   8   7   2   5   8   5 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   5   5   8   5   5   5   8   5   2   8   1   1   1   1   1   9   1   9   1   1 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   9   4   4   1   9   9   9   1   9   9   9   1   1   1   9   1   1   9   1   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   1   1   9   9   1   1   9   1   4   1   9   4   1   1   1   1   9   1   9   9 
## 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 
##   6   2   6   8   6   5   6   8   7   8   3   2   8   2   8   6   5   8   5   8 
## 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 
##   2   8   5   2   6   6   6   6   5   8   3   8   2   5   2   2   7   5   2   2 
## 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 
##   8   2   8   8   8   8   2   8   8   2   6   5   6   6   6   6   3   7   7   6 
## 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 
##   2   5   6   5   2   6   7   6   7   3   6   5   7   7   6   6   5   2   5   6 
## 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 
##   7   6   6   5   5   6   6   2   5   6   7   6   5   5   6   5   5   2   2   2 
## 
## Within cluster sum of squares by cluster:
## [1] 19.995102 11.336938 15.478382 15.115263 15.118846 22.972132 12.029487
## [8] 26.892726  9.645307
##  (between_SS / total_SS =  87.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Table of Cluster center:

ClusCen <- aggregate(Data1,by=list(fit$cluster),FUN=mean)
ClusCen <- ClusCen[,-1]

Scale the point <5.3, 3.1, 1.9, 1>

zScorces <- function(x,mu,sig){
  z = (x-mu)/sig
}
p1 <- seq(0,3)
p1[1] <- zScorces(5.3, mean(Data$Feature.1),sd(Data$Feature.1))
p1[2] <- zScorces(3.1, mean(Data$Feature.2),sd(Data$Feature.2))
p1[3] <- zScorces(1.9, mean(Data$Feature.3),sd(Data$Feature.3))
p1[4] <- zScorces(1, mean(Data$Feature.4),sd(Data$Feature.4))

Distance of the point to each cluster center:

dis <- seq(0,8)
for (k in 1:9) {
  dis[k] = (ClusCen[k,1]-p1[1])^2 + (ClusCen[k,2]-p1[2])^2 + (ClusCen[k,3]-p1[3])^2 + (ClusCen[k,4]-p1[4])^2
}
# The distance of the point to each cluster center in order of cluster 1 to 9
dis

## [1]  1.737338  7.368410  3.020055  1.119844  8.279740 16.341063 14.500023
## [8]  3.742555  6.476245

The point belong the cluster that it has the closet distance to the cluster center

which.min(dis)

## [1] 4

The point <5.3, 3.1, 1.9, 1> belongs to cluster 4.

5.

Predict Feature 1 based on Feature 2,3,4.

Using KNN

Put Feature.1 into 5 smaller groups

range(Data$Feature.1)

## [1] 4.30000 8.54955

x1 <- (8.54955-4.3)/5

x2 <- 4.3
while (x2<8.54955) {
  print(x2)
  x2 = x2 + x1
}

## [1] 4.3
## [1] 5.14991
## [1] 5.99982
## [1] 6.84973
## [1] 7.69964

group1: less than 5.14991 group2: 5.14991 to less than 5.99982 group2: 5.99982 to less than 6.84973 group2: 6.84973 to less than 7.69964 group1: greater than or equal to 7.69964

group <- function(o){
  if (o <5.14991) {
"group1"
} else if ( o >= 5.14991 & o < 5.99982) {
"group2"
} else if ( o >= 5.99982 & o < 6.84973) {
"group3"
} else if ( o >= 6.84973 & o < 7.69964) {
"group4"
}else {
"group5"
}
}

Group <- seq(0,299)
for(q in 1:300){
  Group[q] <- group(Data$Feature.1[q])
}
Data2 <- cbind(Data,Group)
head(Data2)

summary(Data2)

##    Feature.1       Feature.2       Feature.3       Feature.4         Group   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.1000   group1:53  
##  1st Qu.:5.400   1st Qu.:3.000   1st Qu.:2.090   1st Qu.:0.6809   group2:87  
##  Median :6.072   Median :3.252   Median :4.500   Median :1.5000   group3:97  
##  Mean   :6.095   Mean   :3.324   Mean   :4.020   Mean   :1.4423   group4:47  
##  3rd Qu.:6.700   3rd Qu.:3.706   3rd Qu.:5.500   3rd Qu.:2.1000   group5:16  
##  Max.   :8.550   Max.   :5.171   Max.   :7.511   Max.   :3.2646

Generate a random sample that is 90% of the total number of rows in dataset for the training data.

set.seed(011)
SampleSet <- sample(1:nrow(Data2), 0.9 * nrow(Data2))

Normalized the variables

Normalizer <-function(x) { (x -min(x))/(max(x)-min(x)) }

NormDataSet <- as.data.frame(lapply(Data2[,c(2,3,4)], Normalizer))

summary(NormDataSet)

##    Feature.2        Feature.3        Feature.4     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3154   1st Qu.:0.1674   1st Qu.:0.1836  
##  Median :0.3947   Median :0.5375   Median :0.4424  
##  Mean   :0.4175   Mean   :0.4639   Mean   :0.4241  
##  3rd Qu.:0.5380   3rd Qu.:0.6911   3rd Qu.:0.6320  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000

Training dataset

TrainingDataSet <- NormDataSet[SampleSet,]

Testing dataset

TestingDataSet <- NormDataSet[-SampleSet,]

Extract 5th column

FeatureTrainCategory <- Data2[SampleSet,5]

FeatureTestCategory <- Data2[-SampleSet,5]

Run KNN with k = 5

KNNTest <- knn(TrainingDataSet,TestingDataSet,cl=FeatureTrainCategory,k=5)

Create a confusion matrix to test the model

CM <- table(KNNTest,FeatureTestCategory)
CM

##         FeatureTestCategory
## KNNTest  group1 group2 group3 group4 group5
##   group1      6      0      0      0      0
##   group2      2      5      0      0      0
##   group3      0      1     10      2      0
##   group4      0      0      0      3      0
##   group5      0      0      0      1      0

Check for accuracy

accuracy <- function(x){sum(diag(x)/(sum(rowSums(x)))) * 100}
accuracy(CM)

## [1] 80

Try the model:

Given Feature 2,3,4 as following <2.3, 5.3, 0.8>

knn(TrainingDataSet,c(4.16, 2.48, 0.45),cl=FeatureTrainCategory,k=5)

## [1] group5
## Levels: group1 group2 group3 group4 group5

ML-Algorithm-Assignment

Minh Le

4/7/2020

1.

2.

3.

4.

5.