## Warning: package 'class' was built under R version 3.6.3
Exploratory analysis
str(women)
## 'data.frame': 15 obs. of 2 variables:
## $ height: num 58 59 60 61 62 63 64 65 66 67 ...
## $ weight: num 115 117 120 123 126 129 132 135 139 142 ...
The height variable was measured in inch and weight was measured in pounds
summary(women)
## height weight
## Min. :58.0 Min. :115.0
## 1st Qu.:61.5 1st Qu.:124.5
## Median :65.0 Median :135.0
## Mean :65.0 Mean :136.7
## 3rd Qu.:68.5 3rd Qu.:148.0
## Max. :72.0 Max. :164.0
The mean and the meadian of height have the same value. The value of mean and meadian of weight are close (1.7 lbs different)
It seems the distribution of height and weight are both symmetrical.
boxplot(women$height, main = "Boxplot of height")
boxplot(women$weight, main = "Boxplot of weight")
plot(women$height, women$weight, xlab = "Height", ylab = "Weight", pch =19)
Based on the plot, there might be a correlation between height and weight.
Hypothesis: There is a positive correlation between women height and their weight.
cor.test(women$height,women$weight)
##
## Pearson's product-moment correlation
##
## data: women$height and women$weight
## t = 37.855, df = 13, p-value = 1.091e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9860970 0.9985447
## sample estimates:
## cor
## 0.9954948
We have \(p-value = 1.091^{-14} (< 0.05)\) and \(cor.coeff = 0.9954948 \approx 1\)
Based on Pearson test, there is a strong positive correlation between the height and weight of women.
plot(women$height, women$weight, xlab = "Height", ylab = "Weight", pch =19, main = "Relationship of Height and Weight")
abline(lm(women$weight~women$height))
Based on the graph in question 2, I can build linear regression to predict the weight of Sandra Elaine Allen using her height.
Divide the dataset into 2 smaller datasets randomly (w1 and w2)
set.seed(011)
w1 <- women[sample(nrow(women), 8), ]
w2 <- women[ !(women$height %in% w1$height), ]
Use w1 to build the model
lm(w1$weight~w1$height)
##
## Call:
## lm(formula = w1$weight ~ w1$height)
##
## Coefficients:
## (Intercept) w1$height
## -62.274 3.042
Model: (W for weight and H for height)
\(W = -62.274 + 3.042 \times H\)
Test the model using w2:
W <- rep(0,7)
for (i in 1:7) {W[i] <- w2$height[i]*3.042 - 62.274
}
W
## [1] 120.246 132.414 144.582 147.624 150.666 153.708 156.750
Error of the model:
dif <- rep(0,7)
for (i in 1:7) {dif[i] <- abs(w2$weight[i] - W[i])}
dif
## [1] 0.246 0.414 1.418 2.376 3.334 5.292 7.250
max(dif)
## [1] 7.25
It seems the predict weight can be 7.25 lbs different from the actual weight base on the test.
Allen’s height is 7ft 7in. \(7ft. 7in = 91 in\) since \(1ft = 12 in\)
Predict Allen’s weight by using the above model.
91*3.042 - 62.274
## [1] 214.548
We will expecte Allen’s actual weight will be arround 214.548 lbs if we used this model.
If we use the entire dataset to build the model
WO <- lm(women$weight~women$height)
summary(WO)
##
## Call:
## lm(formula = women$weight ~ women$height)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7333 -1.1333 -0.3833 0.7417 3.1167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.51667 5.93694 -14.74 1.71e-09 ***
## women$height 3.45000 0.09114 37.85 1.09e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared: 0.991, Adjusted R-squared: 0.9903
## F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14
The model will be: \(W = -87.51667 + 3.45 \times H\)
Predict Allen’s weight by using the above model.
-87.51667 + 3.45*91
## [1] 226.4333
We will expecte Allen’s actual weight will be arround 226.4333 lbs if we used this model.
Import the dataset
Data <- read.csv("D:/Okanagan/Winter 2020/DSCI 100/Assigment/Last assignment/myData.csv")
Data <- Data[,-1]
summary(Data)
## Feature.1 Feature.2 Feature.3 Feature.4
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.1000
## 1st Qu.:5.400 1st Qu.:3.000 1st Qu.:2.090 1st Qu.:0.6809
## Median :6.072 Median :3.252 Median :4.500 Median :1.5000
## Mean :6.095 Mean :3.324 Mean :4.020 Mean :1.4423
## 3rd Qu.:6.700 3rd Qu.:3.706 3rd Qu.:5.500 3rd Qu.:2.1000
## Max. :8.550 Max. :5.171 Max. :7.511 Max. :3.2646
Data <- na.omit(Data)
Data1 <- scale(Data)
set.seed(011)
# Determine number of clusters
# Determine withinss for clusters 1 - 20 with 15 tries
wss <- 1
for (i in 1:20) wss[i] <- sum(kmeans(Data1,
centers=i, nstart=1000)$withinss)
# Plot the withinss for each cluster
plot(1:20, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")
For me the cluster sum of square after 9th cluster is low and the value is not within groups sum of squares does not change much after that.
set.seed(011)
#K-Means Cluster Analysis
fit <- kmeans(Data1, 9, nstart=1000) # 9 cluster solution
# fit contains all the data about
# each cluster
# get cluster means
aggregate(Data1,by=list(fit$cluster),FUN=mean)
# append cluster assignment to each data point in the data set
Data1 <- data.frame(Data1, fit$cluster)
head(Data1)
fit
## K-means clustering with 9 clusters of sizes 40, 22, 29, 40, 40, 32, 19, 58, 20
##
## Cluster means:
## Feature.1 Feature.2 Feature.3 Feature.4
## 1 -0.81769088 0.8234958 -1.23483967 -0.9784518
## 2 0.26453881 0.5315408 0.65251272 0.8004467
## 3 -0.59306212 -1.5394630 0.05047642 -0.1677887
## 4 -1.34213483 -0.1214047 -1.41012265 -1.4089059
## 5 0.51095192 -0.4568852 0.82892225 0.9756175
## 6 1.56888673 0.9393686 1.10569315 1.2709078
## 7 1.61527457 -0.5343622 1.29284141 0.9172624
## 8 0.09465705 -0.6453076 0.33013576 0.1242195
## 9 -0.45254008 2.0331610 -1.11357676 -1.0788054
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 4 4 4 4 4 1 4 4 4 4 1 4 4 4 1 9 1 4 1 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 4 1 4 4 4 4 4 4 4 4 4 4 1 9 4 4 1 4 4 4
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 4 4 4 4 1 4 1 4 1 4 8 8 8 3 8 3 8 3 8 3
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 3 8 3 8 3 8 8 3 3 3 8 8 8 8 8 8 8 8 8 3
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 3 3 3 8 8 8 8 3 8 3 3 8 3 3 3 8 8 8 3 3
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 5 8 7 8 5 7 3 7 5 6 5 8 5 3 5 5 5 6 7 3
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 5 8 7 8 5 7 8 8 5 7 7 6 5 8 8 7 2 5 8 5
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 5 5 8 5 5 5 8 5 2 8 1 1 1 1 1 9 1 9 1 1
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 9 4 4 1 9 9 9 1 9 9 9 1 1 1 9 1 1 9 1 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 1 1 9 9 1 1 9 1 4 1 9 4 1 1 1 1 9 1 9 9
## 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
## 6 2 6 8 6 5 6 8 7 8 3 2 8 2 8 6 5 8 5 8
## 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
## 2 8 5 2 6 6 6 6 5 8 3 8 2 5 2 2 7 5 2 2
## 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
## 8 2 8 8 8 8 2 8 8 2 6 5 6 6 6 6 3 7 7 6
## 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
## 2 5 6 5 2 6 7 6 7 3 6 5 7 7 6 6 5 2 5 6
## 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300
## 7 6 6 5 5 6 6 2 5 6 7 6 5 5 6 5 5 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 19.995102 11.336938 15.478382 15.115263 15.118846 22.972132 12.029487
## [8] 26.892726 9.645307
## (between_SS / total_SS = 87.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Table of Cluster center:
ClusCen <- aggregate(Data1,by=list(fit$cluster),FUN=mean)
ClusCen <- ClusCen[,-1]
Scale the point <5.3, 3.1, 1.9, 1>
zScorces <- function(x,mu,sig){
z = (x-mu)/sig
}
p1 <- seq(0,3)
p1[1] <- zScorces(5.3, mean(Data$Feature.1),sd(Data$Feature.1))
p1[2] <- zScorces(3.1, mean(Data$Feature.2),sd(Data$Feature.2))
p1[3] <- zScorces(1.9, mean(Data$Feature.3),sd(Data$Feature.3))
p1[4] <- zScorces(1, mean(Data$Feature.4),sd(Data$Feature.4))
Distance of the point to each cluster center:
dis <- seq(0,8)
for (k in 1:9) {
dis[k] = (ClusCen[k,1]-p1[1])^2 + (ClusCen[k,2]-p1[2])^2 + (ClusCen[k,3]-p1[3])^2 + (ClusCen[k,4]-p1[4])^2
}
# The distance of the point to each cluster center in order of cluster 1 to 9
dis
## [1] 1.737338 7.368410 3.020055 1.119844 8.279740 16.341063 14.500023
## [8] 3.742555 6.476245
The point belong the cluster that it has the closet distance to the cluster center
which.min(dis)
## [1] 4
The point <5.3, 3.1, 1.9, 1> belongs to cluster 4.
Predict Feature 1 based on Feature 2,3,4.
Using KNN
Put Feature.1 into 5 smaller groups
range(Data$Feature.1)
## [1] 4.30000 8.54955
x1 <- (8.54955-4.3)/5
x2 <- 4.3
while (x2<8.54955) {
print(x2)
x2 = x2 + x1
}
## [1] 4.3
## [1] 5.14991
## [1] 5.99982
## [1] 6.84973
## [1] 7.69964
group1: less than 5.14991 group2: 5.14991 to less than 5.99982 group2: 5.99982 to less than 6.84973 group2: 6.84973 to less than 7.69964 group1: greater than or equal to 7.69964
group <- function(o){
if (o <5.14991) {
"group1"
} else if ( o >= 5.14991 & o < 5.99982) {
"group2"
} else if ( o >= 5.99982 & o < 6.84973) {
"group3"
} else if ( o >= 6.84973 & o < 7.69964) {
"group4"
}else {
"group5"
}
}
Group <- seq(0,299)
for(q in 1:300){
Group[q] <- group(Data$Feature.1[q])
}
Data2 <- cbind(Data,Group)
head(Data2)
summary(Data2)
## Feature.1 Feature.2 Feature.3 Feature.4 Group
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.1000 group1:53
## 1st Qu.:5.400 1st Qu.:3.000 1st Qu.:2.090 1st Qu.:0.6809 group2:87
## Median :6.072 Median :3.252 Median :4.500 Median :1.5000 group3:97
## Mean :6.095 Mean :3.324 Mean :4.020 Mean :1.4423 group4:47
## 3rd Qu.:6.700 3rd Qu.:3.706 3rd Qu.:5.500 3rd Qu.:2.1000 group5:16
## Max. :8.550 Max. :5.171 Max. :7.511 Max. :3.2646
Generate a random sample that is 90% of the total number of rows in dataset for the training data.
set.seed(011)
SampleSet <- sample(1:nrow(Data2), 0.9 * nrow(Data2))
Normalized the variables
Normalizer <-function(x) { (x -min(x))/(max(x)-min(x)) }
NormDataSet <- as.data.frame(lapply(Data2[,c(2,3,4)], Normalizer))
summary(NormDataSet)
## Feature.2 Feature.3 Feature.4
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3154 1st Qu.:0.1674 1st Qu.:0.1836
## Median :0.3947 Median :0.5375 Median :0.4424
## Mean :0.4175 Mean :0.4639 Mean :0.4241
## 3rd Qu.:0.5380 3rd Qu.:0.6911 3rd Qu.:0.6320
## Max. :1.0000 Max. :1.0000 Max. :1.0000
Training dataset
TrainingDataSet <- NormDataSet[SampleSet,]
Testing dataset
TestingDataSet <- NormDataSet[-SampleSet,]
Extract 5th column
FeatureTrainCategory <- Data2[SampleSet,5]
FeatureTestCategory <- Data2[-SampleSet,5]
Run KNN with k = 5
KNNTest <- knn(TrainingDataSet,TestingDataSet,cl=FeatureTrainCategory,k=5)
Create a confusion matrix to test the model
CM <- table(KNNTest,FeatureTestCategory)
CM
## FeatureTestCategory
## KNNTest group1 group2 group3 group4 group5
## group1 6 0 0 0 0
## group2 2 5 0 0 0
## group3 0 1 10 2 0
## group4 0 0 0 3 0
## group5 0 0 0 1 0
Check for accuracy
accuracy <- function(x){sum(diag(x)/(sum(rowSums(x)))) * 100}
accuracy(CM)
## [1] 80
Try the model:
Given Feature 2,3,4 as following <2.3, 5.3, 0.8>
knn(TrainingDataSet,c(4.16, 2.48, 0.45),cl=FeatureTrainCategory,k=5)
## [1] group5
## Levels: group1 group2 group3 group4 group5