# K-means clustering ----
# Install the required packages.
# "stats" ships with base R and cannot be installed from CRAN, so it is not
# listed. Only packages that are not already present are installed, so
# re-running the script does not download or rebuild anything needlessly.
required_pkgs <- c("klaR", "e1071", "ggplot2", "neuralnet")
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(
    missing_pkgs,
    repos = "https://cloud.r-project.org/",
    dependencies = TRUE
  )
}
## Installing packages into 'C:/Users/HP/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## Warning: package 'stats' is not available (for R version 3.3.1)
## Warning: package 'stats' is a base package, and should not be updated
## Package which is only available in source form, and may need
## compilation of C/C++/Fortran: 'e1071'
## These will not be installed
## installing the source packages 'klaR', 'ggplot2', 'neuralnet'
## Warning: running command '"C:/PROGRA~1/R/R-33~1.1/bin/i386/R" CMD INSTALL -
## l "C:\Users\HP\Documents\R\win-library\3.3" C:\Users\HP\AppData\Local\Temp
## \RtmpCO3Oix/downloaded_packages/ggplot2_2.1.0.tar.gz' had status 3
## Warning in install.packages(c("stats", "klaR", "e1071", "ggplot2",
## "neuralnet"), : installation of package 'ggplot2' had non-zero exit status
# Loading the library
#library("stats")
#library("klaR")
#library("RODBC")
#library("e1071")
#library("ggplot2")
#library("neuralnet")
# Set the working directory.
# NOTE(review): this path is machine-specific; the read.csv() calls in this
# script use bare file names, which are resolved relative to it.
getwd()
## [1] "E:/RAW/Sample/Loogfiles/R_Working_Directory"
setwd("E:/RAW/Sample/Loogfiles/R_Working_Directory")
# Read the CSV from the working directory and delete rows with NA values.
Age_Inc <- read.csv(file = "k_clust.csv", header = TRUE)
# R names are case-sensitive: the clustering step uses `AGE_INC`, which is a
# different object from `Age_Inc`. Create it here as the NA-free copy.
# NOTE(review): confirm no further data preparation is expected at this point.
AGE_INC <- na.omit(Age_Inc)
# Removing the NA values and data preparation
# Clustering using k-means
## [1] "Age" "Income"
## [1] 500 2
## 'data.frame': 500 obs. of 2 variables:
## $ Age : int 41 41 50 48 43 49 42 32 44 49 ...
## $ Income: int 48574 92285 18216 44784 35735 26343 39296 37235 61425 69680 ...
# Run k-means clustering with k = 5 on the prepared data.
# kmeans() picks its starting centres at random, so the RNG seed is fixed and
# several random starts are tried (the best one is kept). This makes the fit
# reproducible and less sensitive to one unlucky initialisation.
# NOTE(review): the columns are clustered on their raw scales -- confirm
# whether they should be standardised with scale() first.
set.seed(42)
AGE_INC_data <- kmeans(x = AGE_INC, centers = 5, nstart = 25)
AGE_INC_data
## K-means clustering with 5 clusters of sizes 83, 114, 110, 98, 95
##
## Cluster means:
## Age Income
## 1 37.79518 90904.65
## 2 37.23684 39008.91
## 3 37.22727 57583.15
## 4 37.31633 18559.60
## 5 35.89474 75457.33
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 3 1 4 2 2 4 2 2 3 5 3 2 4 2 3 5 3 1
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 5 3 5 4 3 5 1 1 4 4 4 2 2 3 1 5 5 3
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 3 4 2 5 2 3 1 3 5 1 2 2 4 3 3 3 5 3
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 3 4 2 2 2 3 2 4 2 2 5 2 3 1 1 3 1 3
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 3 2 1 1 1 1 3 2 5 3 1 1 2 5 3 3 5 3
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 1 3 4 3 1 5 2 3 2 2 1 4 3 3 1 4 5 3
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 4 4 2 2 4 2 5 2 2 5 3 4 5 2 4 4 3 1
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 2 3 3 2 2 1 3 1 5 3 4 1 3 3 2 3 1 2
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## 4 3 5 2 2 5 4 3 5 4 5 1 1 4 4 1 4 4
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 4 2 2 5 2 4 2 1 2 4 3 4 5 1 2 4 4 2
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## 3 5 2 4 5 4 5 2 4 4 2 5 5 3 2 4 3 3
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## 1 4 1 5 3 5 3 5 3 5 4 4 1 3 3 1 2 4
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## 1 5 2 5 2 5 2 3 3 1 5 5 5 4 4 1 5 1
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## 2 4 2 5 2 3 3 1 4 3 5 2 4 3 1 4 3 4
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
## 5 5 3 3 5 2 3 3 2 1 2 5 5 1 5 5 1 1
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## 4 5 2 4 1 4 5 5 5 3 4 2 2 3 1 3 2 3
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## 3 2 1 2 4 4 2 5 5 4 4 2 2 5 5 3 5 2
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
## 3 1 3 1 2 1 3 2 4 1 5 4 1 2 3 5 3 3
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## 4 4 2 2 4 1 3 2 3 2 3 5 1 4 2 3 5 4
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## 4 4 3 5 1 2 2 3 5 5 1 5 2 4 1 4 4 1
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
## 4 1 5 2 1 3 1 5 4 4 2 3 5 2 3 5 1 5
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## 4 2 3 1 2 4 1 1 2 1 5 1 4 2 4 2 5 4
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
## 3 1 3 3 4 1 2 5 2 3 3 1 2 3 4 3 5 3
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## 2 3 2 2 1 1 4 5 5 4 5 5 3 4 4 1 1 4
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
## 2 4 2 4 4 5 5 5 1 5 4 2 5 3 4 4 3 2
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
## 5 5 1 5 3 2 3 2 2 2 2 5 1 3 3 2 1 2
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
## 4 5 2 4 1 4 3 2 2 2 3 5 1 5 4 2 3 5
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500
## 4 2 3 4 3 1 4 5 2 1 2 3 3 1
##
## Within cluster sum of squares by cluster:
## [1] 1996940454 3704239422 2721563166 3009374895 2068853950
## (between_SS / total_SS = 95.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# List the components available on the fitted kmeans object.
names(AGE_INC_data)
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Print the cluster number assigned to each row of the input data.
AGE_INC_data$cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 3 1 4 2 2 4 2 2 3 5 3 2 4 2 3 5 3 1
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 5 3 5 4 3 5 1 1 4 4 4 2 2 3 1 5 5 3
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 3 4 2 5 2 3 1 3 5 1 2 2 4 3 3 3 5 3
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 3 4 2 2 2 3 2 4 2 2 5 2 3 1 1 3 1 3
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 3 2 1 1 1 1 3 2 5 3 1 1 2 5 3 3 5 3
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 1 3 4 3 1 5 2 3 2 2 1 4 3 3 1 4 5 3
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 4 4 2 2 4 2 5 2 2 5 3 4 5 2 4 4 3 1
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## 2 3 3 2 2 1 3 1 5 3 4 1 3 3 2 3 1 2
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## 4 3 5 2 2 5 4 3 5 4 5 1 1 4 4 1 4 4
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 4 2 2 5 2 4 2 1 2 4 3 4 5 1 2 4 4 2
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## 3 5 2 4 5 4 5 2 4 4 2 5 5 3 2 4 3 3
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## 1 4 1 5 3 5 3 5 3 5 4 4 1 3 3 1 2 4
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
## 1 5 2 5 2 5 2 3 3 1 5 5 5 4 4 1 5 1
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
## 2 4 2 5 2 3 3 1 4 3 5 2 4 3 1 4 3 4
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
## 5 5 3 3 5 2 3 3 2 1 2 5 5 1 5 5 1 1
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
## 4 5 2 4 1 4 5 5 5 3 4 2 2 3 1 3 2 3
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
## 3 2 1 2 4 4 2 5 5 4 4 2 2 5 5 3 5 2
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
## 3 1 3 1 2 1 3 2 4 1 5 4 1 2 3 5 3 3
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342
## 4 4 2 2 4 1 3 2 3 2 3 5 1 4 2 3 5 4
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## 4 4 3 5 1 2 2 3 5 5 1 5 2 4 1 4 4 1
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
## 4 1 5 2 1 3 1 5 4 4 2 3 5 2 3 5 1 5
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396
## 4 2 3 1 2 4 1 1 2 1 5 1 4 2 4 2 5 4
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
## 3 1 3 3 4 1 2 5 2 3 3 1 2 3 4 3 5 3
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432
## 2 3 2 2 1 1 4 5 5 4 5 5 3 4 4 1 1 4
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450
## 2 4 2 4 4 5 5 5 1 5 4 2 5 3 4 4 3 2
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468
## 5 5 1 5 3 2 3 2 2 2 2 5 1 3 3 2 1 2
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
## 4 5 2 4 1 4 3 2 2 2 3 5 1 5 4 2 3 5
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500
## 4 2 3 4 3 1 4 5 2 1 2 3 3 1
# Cluster centres: one row per cluster, one column per variable.
AGE_INC_data$centers
## Age Income
## 1 37.79518 90904.65
## 2 37.23684 39008.91
## 3 37.22727 57583.15
## 4 37.31633 18559.60
## 5 35.89474 75457.33
# Append each row's cluster number to the data.
AGE_INC_Cluster <- cbind(AGE_INC, AGE_INC_data$cluster)
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(AGE_INC_Cluster)
}
# Plot the observations coloured by cluster, then overlay the centres.
plot(AGE_INC, col = AGE_INC_data$cluster)
# `col` must span every centre: a shorter vector such as 1:3 is silently
# recycled, so centres 4 and 5 would reuse the colours of centres 1 and 2.
points(
  AGE_INC_data$centers,
  col = seq_len(nrow(AGE_INC_data$centers)),
  pch = 8,
  cex = 2
)

# Hierarchical clustering ----
# Installing required packages
#install.packages(c("stats","klaR","cluster"),repos ="https://cloud.r-project.org/",dependencies = TRUE)
# Load the library.
# library() attaches one package per call; extra positional arguments are
# matched to its `help` and `pos` parameters, not treated as more packages.
# Everything used below (dist, hclust, rect.hclust) comes from stats, so that
# is the only package loaded.
library("stats")
# Compute the distance matrix between the rows of mtcars.
d <- dist(mtcars)
# Perform hierarchical clustering with two different linkage methods.
ward <- hclust(d, method = "ward.D2")
single <- hclust(d, method = "single")
# Plot each dendrogram and draw boxes around the 4-cluster cut.
plot(ward)
rect.hclust(ward, k = 4)

plot(single)
rect.hclust(single, k = 4)

# ANOVA ----
# Create the data: ten observations for each of four shelves.
shelf1 <- c(21, 20, 16, 18, 17, 13, 18, 17, 17, 22)
shelf2 <- c(19, 18, 21, 25, 21, 16, 12, 16, 14, 19)
shelf3 <- c(17, 22, 15, 10, 17, 21, 18, 15, 21, 18)
shelf4 <- c(12, 10, 17, 17, 14, 18, 19, 20, 23, 15)
# Combine the shelves into a data frame and inspect it.
store <- data.frame(shelf1, shelf2, shelf3, shelf4)
# View() opens a GUI data viewer, so only call it in an interactive session;
# this keeps the script runnable under Rscript.
if (interactive()) {
  View(store)
}
colMeans(store)
## shelf1 shelf2 shelf3 shelf4
## 17.9 18.1 17.4 16.5
# Reshape to long format: one row per observation, with the value in the
# first column and the originating shelf in the second.
store1 <- stack(store)
if (interactive()) {
  View(store1)
}
names(store1) <- c("Values", "Shelf")
# Perform a one-way ANOVA of the values by shelf.
result <- aov(Values ~ Shelf, data = store1)
summary(result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Shelf 3 15.3 5.092 0.418 0.741
## Residuals 36 438.7 12.186
# Read the data.
# NOTE(review): the object is named `basketball` but the file is
# "Football.csv" -- confirm the intended data set.
basketball <- read.csv(file = "Football.csv", header = TRUE)
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(basketball)
}
names(basketball)
## [1] "Time" "Shoes" "Shots.Made"
# Perform a two-way ANOVA with interaction.
# The data frame is passed through `data =` instead of attach(): attach() puts
# the columns on the search path, where they can mask or be masked by other
# objects with the same names.
twowayanova <- aov(Shots.Made ~ Time * Shoes, data = basketball)
summary(twowayanova)
## Df Sum Sq Mean Sq F value Pr(>F)
## Time 1 7.56 7.56 0.344 0.568
## Shoes 1 39.06 39.06 1.777 0.207
## Time:Shoes 1 18.06 18.06 0.822 0.382
## Residuals 12 263.75 21.98
# Naive Bayes ----
# Load the e1071 package, which provides naiveBayes().
## Package e1071(Naive Bayes)
#library("RODBC","e1071","ggplot2")
library("e1071")
# Read the data.
# stringsAsFactors = TRUE keeps text columns as factors on every R version
# (the read.csv() default changed to FALSE in R 4.0.0), so summary() reports
# per-level counts for them.
df_TM <- read.csv(
  file = "NB_CSV.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
#View(df_TM)
## Summary of the dataset
summary(df_TM)
## CustomerKey MaritalStatus Gender TotalChildren
## Min. :11000 M:10011 F:9133 Min. :0.000
## 1st Qu.:15621 S: 8473 M:9351 1st Qu.:0.000
## Median :20242 Median :2.000
## Mean :20242 Mean :1.844
## 3rd Qu.:24862 3rd Qu.:3.000
## Max. :29483 Max. :5.000
## NumberChildrenAtHome Education Occupation
## Min. :0.000 Bachelors :5356 Clerical :2928
## 1st Qu.:0.000 Graduate Degree :3189 Management :3075
## Median :0.000 High School :3294 Manual :2384
## Mean :1.004 Partial College :5064 Professional :5520
## 3rd Qu.:2.000 Partial High School:1581 Skilled Manual:4577
## Max. :5.000
## HouseOwnerFlag NumberCarsOwned CommuteDistance Region
## Min. :0.0000 Min. :0.000 0-1 Miles :6310 Europe :5503
## 1st Qu.:0.0000 1st Qu.:1.000 1-2 Miles :3232 North America:9390
## Median :1.0000 Median :2.000 10+ Miles :2494 Pacific :3591
## Mean :0.6764 Mean :1.503 2-5 Miles :3234
## 3rd Qu.:1.0000 3rd Qu.:2.000 5-10 Miles:3214
## Max. :1.0000 Max. :4.000
## BikeBuyer
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.494
## 3rd Qu.:1.000
## Max. :1.000
# Mean of the NumberCarsOwned column.
colMeans(df_TM["NumberCarsOwned"])
## NumberCarsOwned
## 1.502705
# Number of rows in each Gender x Education combination.
aggregate(
  CustomerKey ~ Gender + Education,
  data = df_TM,
  FUN = length
)
## Gender Education CustomerKey
## 1 F Bachelors 2628
## 2 M Bachelors 2728
## 3 F Graduate Degree 1611
## 4 M Graduate Degree 1578
## 5 F High School 1595
## 6 M High School 1699
## 7 F Partial College 2542
## 8 M Partial College 2522
## 9 F Partial High School 757
## 10 M Partial High School 824
# Plot the row count per region, stacked by education level.
# ggplot2 is loaded here because the plotting calls below need it.
library("ggplot2")
ggplot(df_TM, aes(Region, fill = Education)) +
  geom_bar()
# Plot two separate bar plots of occupation counts by marital status: one
# panel for married (M) and the other for single (S).
# Occupation is categorical, so geom_bar() is used (geom_histogram() bins a
# continuous x). The colour is quoted with plain ASCII quotes; typographic
# quotes are not valid R syntax.
ggplot(df_TM, aes(Occupation)) +
  geom_bar(color = "white") +
  facet_grid(MaritalStatus ~ .)
# Build the Naive Bayes model
#require(e1071)
#library("e1071")
# Columns 2-11 of df_TM are the predictors and column 12 is the class; per the
# names() output these are MaritalStatus..Region and BikeBuyer.
# The argument expressions are kept as written because they appear verbatim
# as labels in the printed model tables.
TMNB<-naiveBayes(df_TM[,2:11],df_TM[,12])
# List the column names of df_TM (to check the column positions used above).
names(df_TM)
## [1] "CustomerKey" "MaritalStatus" "Gender"
## [4] "TotalChildren" "NumberChildrenAtHome" "Education"
## [7] "Occupation" "HouseOwnerFlag" "NumberCarsOwned"
## [10] "CommuteDistance" "Region" "BikeBuyer"
# A priori class counts for the target variable (column 12 of df_TM)
TMNB$apriori
## df_TM[, 12]
## 0 1
## 9352 9132
# Per-predictor tables of the fitted model, one entry per predictor with one
# row per class.
# NOTE(review): for numeric predictors the two unnamed columns are presumably
# the per-class mean and standard deviation -- confirm against e1071 docs.
TMNB$tables
## $MaritalStatus
## MaritalStatus
## df_TM[, 12] M S
## 0 0.5731394 0.4268606
## 1 0.5093079 0.4906921
##
## $Gender
## Gender
## df_TM[, 12] F M
## 0 0.4850299 0.5149701
## 1 0.5033947 0.4966053
##
## $TotalChildren
## TotalChildren
## df_TM[, 12] [,1] [,2]
## 0 2.046942 1.703394
## 1 1.636881 1.485295
##
## $NumberChildrenAtHome
## NumberChildrenAtHome
## df_TM[, 12] [,1] [,2]
## 0 1.1345167 1.618151
## 1 0.8704555 1.405815
##
## $Education
## Education
## df_TM[, 12] Bachelors Graduate Degree High School Partial College
## 0 0.25053464 0.15911035 0.20455518 0.26946108
## 1 0.32993868 0.18626807 0.15122646 0.27858081
## Education
## df_TM[, 12] Partial High School
## 0 0.11633875
## 1 0.05398598
##
## $Occupation
## Occupation
## df_TM[, 12] Clerical Management Manual Professional Skilled Manual
## 0 0.1378315 0.1755774 0.1437126 0.2941617 0.2487169
## 1 0.1794788 0.1569207 0.1138852 0.3032194 0.2464958
##
## $HouseOwnerFlag
## HouseOwnerFlag
## df_TM[, 12] [,1] [,2]
## 0 0.6729042 0.4691777
## 1 0.6799168 0.4665338
##
## $NumberCarsOwned
## NumberCarsOwned
## df_TM[, 12] [,1] [,2]
## 0 1.706159 1.119266
## 1 1.294350 1.120042
##
## $CommuteDistance
## CommuteDistance
## df_TM[, 12] 0-1 Miles 1-2 Miles 10+ Miles 2-5 Miles 5-10 Miles
## 0 0.29640719 0.18210009 0.16958939 0.14916595 0.20273738
## 1 0.38742882 0.16743320 0.09943057 0.20137976 0.14432764
##
## $Region
## Region
## df_TM[, 12] Europe North America Pacific
## 0 0.2941617 0.5522883 0.1535500
## 1 0.3013579 0.4626588 0.2359834
# Predictions
#predict(TMNB,df_TM,type = "raw")
# Data frame of raw predictions for all rows, one column per class. The
# intermediate matrix is kept inside local() so no extra object is left in
# the workspace.
df_PR <- local({
  class_probs <- predict(TMNB, df_TM, type = "raw")
  as.data.frame(class_probs)
})
# Append the prediction columns to the original data.
df_TM_PR <- cbind(df_TM, df_PR)
#View(df_TM_PR)
# Scatter plot, then box plots, of the two prediction columns.
plot(df_TM_PR$`0`, df_TM_PR$`1`)

boxplot(df_TM_PR$`0`, df_TM_PR$`1`)

# Neural networks ----
#install.packages("neuralnet")
#library("neuralnet")
library("neuralnet")
# Read the data and view its structure.
df_TM_NN <- read.csv(file = "LG_CSV.csv", header = TRUE)
names(df_TM_NN)
## [1] "CustomerKey" "MaritalStatus" "Gender"
## [4] "TotalChildren" "NumberChildrenAtHome" "Education"
## [7] "Occupation" "HouseOwnerFlag" "NumberCarsOwned"
## [10] "CommuteDistance" "Region" "BikeBuyer"
## [13] "Age" "YearlyIncome"
# Limit the data set to at most the first 500 rows.
# head() returns fewer rows when the file is shorter, whereas indexing with
# 1:500 would pad the result with all-NA rows.
ds <- head(df_TM_NN, 500)
#View(df_TM_NN)
# Train the neural network with a single hidden layer (hidden = 1).
# The starting weights are drawn at random, so the seed is fixed to make the
# fitted network and its plot reproducible.
# NOTE(review): the inputs are used on their raw scales -- confirm whether
# they should be normalised before training.
set.seed(42)
TMNN <- neuralnet(
  BikeBuyer ~ YearlyIncome + Age + NumberCarsOwned,
  ds,
  hidden = 1
)
# Basic data
#print(TMNN)
# Predict
#prediction(TMNN)
# Plot the fitted network.
plot(TMNN)