K-Means Clustering

# Installing required packages
# "stats" is a base package that ships with R; it cannot be installed or
# updated from CRAN, so it is not requested here.
# Only packages that are not installed yet are downloaded, so re-running the
# script does not reinstall everything each time.
required_pkgs <- c("klaR", "e1071", "ggplot2", "neuralnet")
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(
    missing_pkgs,
    repos = "https://cloud.r-project.org/",
    dependencies = TRUE
  )
}
## Installing packages into 'C:/Users/HP/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## Warning: package 'stats' is not available (for R version 3.3.1)
## Warning: package 'stats' is a base package, and should not be updated
## Package which is only available in source form, and may need
##   compilation of C/C++/Fortran: 'e1071'
##   These will not be installed
## installing the source packages 'klaR', 'ggplot2', 'neuralnet'
## Warning: running command '"C:/PROGRA~1/R/R-33~1.1/bin/i386/R" CMD INSTALL -
## l "C:\Users\HP\Documents\R\win-library\3.3" C:\Users\HP\AppData\Local\Temp
## \RtmpCO3Oix/downloaded_packages/ggplot2_2.1.0.tar.gz' had status 3
## Warning in install.packages(c("stats", "klaR", "e1071", "ggplot2",
## "neuralnet"), : installation of package 'ggplot2' had non-zero exit status
# Loading the library
#library("stats")
#library("klaR")

#library("RODBC")
#library("e1071")
#library("ggplot2")
#library("neuralnet")
# Set the working directory
getwd()
## [1] "E:/RAW/Sample/Loogfiles/R_Working_Directory"
# NOTE(review): this is a hard-coded, machine-specific path; the relative
# file names passed to read.csv() in this script are resolved against it.
setwd("E:/RAW/Sample/Loogfiles/R_Working_Directory")
# Read the CSV from the working directory and delete rows with NA values
Age_Inc <- read.csv(file = "k_clust.csv", header = TRUE)
# R is case-sensitive: the clustering code further down works on `AGE_INC`,
# not `Age_Inc`, so the NA-free copy is stored under that name.
AGE_INC <- na.omit(Age_Inc)

Removing the NA Values and Data Preparation

Clustering Using Kmeans Clustering

## [1] "Age"    "Income"
## [1] 500   2
## 'data.frame':    500 obs. of  2 variables:
##  $ Age   : int  41 41 50 48 43 49 42 32 44 49 ...
##  $ Income: int  48574 92285 18216 44784 35735 26343 39296 37235 61425 69680 ...
# Performing k-means clustering (5 clusters) on the Age/Income data
# kmeans() chooses its starting centres at random, so without a fixed seed
# the cluster numbering (and possibly the partition) changes on every run.
set.seed(123)
# nstart = 25 tries 25 random starts and keeps the best solution, which
# makes the result far less dependent on one unlucky initialisation.
# NOTE(review): the columns are clustered unscaled; the printed cluster means
# differ almost only in Income, so Age barely influences the result.
# Consider clustering scale(AGE_INC) instead -- confirm the intended analysis.
AGE_INC_data <- kmeans(x = AGE_INC, centers = 5, nstart = 25)
AGE_INC_data
## K-means clustering with 5 clusters of sizes 83, 114, 110, 98, 95
## 
## Cluster means:
##        Age   Income
## 1 37.79518 90904.65
## 2 37.23684 39008.91
## 3 37.22727 57583.15
## 4 37.31633 18559.60
## 5 35.89474 75457.33
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   3   1   4   2   2   4   2   2   3   5   3   2   4   2   3   5   3   1 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   5   3   5   4   3   5   1   1   4   4   4   2   2   3   1   5   5   3 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   3   4   2   5   2   3   1   3   5   1   2   2   4   3   3   3   5   3 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   3   4   2   2   2   3   2   4   2   2   5   2   3   1   1   3   1   3 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   3   2   1   1   1   1   3   2   5   3   1   1   2   5   3   3   5   3 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   1   3   4   3   1   5   2   3   2   2   1   4   3   3   1   4   5   3 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   4   4   2   2   4   2   5   2   2   5   3   4   5   2   4   4   3   1 
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 
##   2   3   3   2   2   1   3   1   5   3   4   1   3   3   2   3   1   2 
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 
##   4   3   5   2   2   5   4   3   5   4   5   1   1   4   4   1   4   4 
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   4   2   2   5   2   4   2   1   2   4   3   4   5   1   2   4   4   2 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 
##   3   5   2   4   5   4   5   2   4   4   2   5   5   3   2   4   3   3 
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 
##   1   4   1   5   3   5   3   5   3   5   4   4   1   3   3   1   2   4 
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 
##   1   5   2   5   2   5   2   3   3   1   5   5   5   4   4   1   5   1 
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 
##   2   4   2   5   2   3   3   1   4   3   5   2   4   3   1   4   3   4 
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 
##   5   5   3   3   5   2   3   3   2   1   2   5   5   1   5   5   1   1 
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 
##   4   5   2   4   1   4   5   5   5   3   4   2   2   3   1   3   2   3 
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 
##   3   2   1   2   4   4   2   5   5   4   4   2   2   5   5   3   5   2 
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 
##   3   1   3   1   2   1   3   2   4   1   5   4   1   2   3   5   3   3 
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 
##   4   4   2   2   4   1   3   2   3   2   3   5   1   4   2   3   5   4 
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 
##   4   4   3   5   1   2   2   3   5   5   1   5   2   4   1   4   4   1 
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 
##   4   1   5   2   1   3   1   5   4   4   2   3   5   2   3   5   1   5 
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 
##   4   2   3   1   2   4   1   1   2   1   5   1   4   2   4   2   5   4 
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 
##   3   1   3   3   4   1   2   5   2   3   3   1   2   3   4   3   5   3 
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 
##   2   3   2   2   1   1   4   5   5   4   5   5   3   4   4   1   1   4 
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 
##   2   4   2   4   4   5   5   5   1   5   4   2   5   3   4   4   3   2 
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 
##   5   5   1   5   3   2   3   2   2   2   2   5   1   3   3   2   1   2 
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 
##   4   5   2   4   1   4   3   2   2   2   3   5   1   5   4   2   3   5 
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 
##   4   2   3   4   3   1   4   5   2   1   2   3   3   1 
## 
## Within cluster sum of squares by cluster:
## [1] 1996940454 3704239422 2721563166 3009374895 2068853950
##  (between_SS / total_SS =  95.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# List the components stored in the fitted k-means object
names(x = AGE_INC_data)
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Cluster number assigned to each observation
AGE_INC_data[["cluster"]]
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   3   1   4   2   2   4   2   2   3   5   3   2   4   2   3   5   3   1 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   5   3   5   4   3   5   1   1   4   4   4   2   2   3   1   5   5   3 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   3   4   2   5   2   3   1   3   5   1   2   2   4   3   3   3   5   3 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   3   4   2   2   2   3   2   4   2   2   5   2   3   1   1   3   1   3 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   3   2   1   1   1   1   3   2   5   3   1   1   2   5   3   3   5   3 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   1   3   4   3   1   5   2   3   2   2   1   4   3   3   1   4   5   3 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   4   4   2   2   4   2   5   2   2   5   3   4   5   2   4   4   3   1 
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 
##   2   3   3   2   2   1   3   1   5   3   4   1   3   3   2   3   1   2 
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 
##   4   3   5   2   2   5   4   3   5   4   5   1   1   4   4   1   4   4 
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   4   2   2   5   2   4   2   1   2   4   3   4   5   1   2   4   4   2 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 
##   3   5   2   4   5   4   5   2   4   4   2   5   5   3   2   4   3   3 
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 
##   1   4   1   5   3   5   3   5   3   5   4   4   1   3   3   1   2   4 
## 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 
##   1   5   2   5   2   5   2   3   3   1   5   5   5   4   4   1   5   1 
## 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 
##   2   4   2   5   2   3   3   1   4   3   5   2   4   3   1   4   3   4 
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 
##   5   5   3   3   5   2   3   3   2   1   2   5   5   1   5   5   1   1 
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 
##   4   5   2   4   1   4   5   5   5   3   4   2   2   3   1   3   2   3 
## 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 
##   3   2   1   2   4   4   2   5   5   4   4   2   2   5   5   3   5   2 
## 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 
##   3   1   3   1   2   1   3   2   4   1   5   4   1   2   3   5   3   3 
## 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 
##   4   4   2   2   4   1   3   2   3   2   3   5   1   4   2   3   5   4 
## 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 
##   4   4   3   5   1   2   2   3   5   5   1   5   2   4   1   4   4   1 
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 
##   4   1   5   2   1   3   1   5   4   4   2   3   5   2   3   5   1   5 
## 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 
##   4   2   3   1   2   4   1   1   2   1   5   1   4   2   4   2   5   4 
## 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 
##   3   1   3   3   4   1   2   5   2   3   3   1   2   3   4   3   5   3 
## 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 
##   2   3   2   2   1   1   4   5   5   4   5   5   3   4   4   1   1   4 
## 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 
##   2   4   2   4   4   5   5   5   1   5   4   2   5   3   4   4   3   2 
## 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 
##   5   5   1   5   3   2   3   2   2   2   2   5   1   3   3   2   1   2 
## 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 
##   4   5   2   4   1   4   3   2   2   2   3   5   1   5   4   2   3   5 
## 487 488 489 490 491 492 493 494 495 496 497 498 499 500 
##   4   2   3   4   3   1   4   5   2   1   2   3   3   1
# Matrix of cluster centres (one row per cluster)
AGE_INC_data[["centers"]]
##        Age   Income
## 1 37.79518 90904.65
## 2 37.23684 39008.91
## 3 37.22727 57583.15
## 4 37.31633 18559.60
## 5 35.89474 75457.33
# Attach each observation's cluster number to the original data
AGE_INC_Cluster <- cbind(AGE_INC, AGE_INC_data$cluster)
# View() opens a data viewer; only call it when a user is at the console
if (interactive()) {
  View(AGE_INC_Cluster)
}
# Plotting the obtained information: observations coloured by cluster number
plot(AGE_INC, col = AGE_INC_data$cluster)
# Draw every centre in the colour of its own cluster. The previous col = 1:3
# was recycled over the 5 centres, so centres 4 and 5 were drawn in the
# colours of clusters 1 and 2.
points(
  AGE_INC_data$centers,
  col = seq_len(nrow(AGE_INC_data$centers)),
  pch = 8,
  cex = 2
)

Hierarchical clustering

# Installing required packages
#install.packages(c("stats","klaR","cluster"),repos ="https://cloud.r-project.org/",dependencies = TRUE)
# Loading the library
# library() attaches ONE package per call. In
# library("stats", "klaR", "cluster") the extra strings were bound to the
# `help` and `pos` arguments, so klaR and cluster were never attached.
# Everything used below (dist, hclust, rect.hclust) comes from stats, so
# only stats is loaded.
library("stats")
# Calculating the distance matrix (dist() defaults to Euclidean distance)
d <- dist(mtcars)
# Performing hierarchical clustering with two different linkage methods
ward <- hclust(d, method = "ward.D2")
single <- hclust(d, method = "single")
# Plotting the obtained information: each dendrogram with a 4-cluster cut
plot(ward)
rect.hclust(ward, k = 4)

plot(single)
rect.hclust(single, k = 4)

ANOVA

# Creating the data: four groups (shelf1..shelf4) of ten observations each
shelf1 <- c(21, 20, 16, 18, 17, 13, 18, 17, 17, 22)
shelf2 <- c(19, 18, 21, 25, 21, 16, 12, 16, 14, 19)
shelf3 <- c(17, 22, 15, 10, 17, 21, 18, 15, 21, 18)
shelf4 <- c(12, 10, 17, 17, 14, 18, 19, 20, 23, 15)
# View the structure of the data (wide format: one column per shelf)
store <- data.frame(shelf1, shelf2, shelf3, shelf4)
# View() opens a data viewer; only call it when a user is at the console
if (interactive()) {
  View(store)
}
colMeans(store)
## shelf1 shelf2 shelf3 shelf4 
##   17.9   18.1   17.4   16.5
# Reshape to long format: one row per observation, with a factor column
# identifying which shelf the value came from
store1 <- stack(store)
if (interactive()) {
  View(store1)
}
names(store1) <- c("Values", "Shelf")
# Performing one-way ANOVA: does the mean of Values differ between shelves?
result <- aov(Values ~ Shelf, data = store1)
summary(result)
##             Df Sum Sq Mean Sq F value Pr(>F)
## Shelf        3   15.3   5.092   0.418  0.741
## Residuals   36  438.7  12.186
# Reading the data into R
basketball <- read.csv(file = "Football.csv", header = TRUE)
# View() opens a data viewer; only call it when a user is at the console
if (interactive()) {
  View(basketball)
}
names(basketball)
## [1] "Time"       "Shoes"      "Shots.Made"
# Performing two-way ANOVA with interaction (Time, Shoes and Time:Shoes).
# The columns are supplied through `data =` instead of attach(), so nothing
# is left on the search path to mask (or be masked by) other objects.
twowayanova <- aov(Shots.Made ~ Time * Shoes, data = basketball)
summary(twowayanova)
##             Df Sum Sq Mean Sq F value Pr(>F)
## Time         1   7.56    7.56   0.344  0.568
## Shoes        1  39.06   39.06   1.777  0.207
## Time:Shoes   1  18.06   18.06   0.822  0.382
## Residuals   12 263.75   21.98

Naive Bayes

# Load the e1071 package, which provides naiveBayes()
#library("RODBC","e1071","ggplot2")
library("e1071")
# Reading the data into R
# stringsAsFactors = TRUE is spelled out because R >= 4.0 no longer converts
# text columns to factors by default; the per-level counts in the summary
# below (e.g. MaritalStatus M/S) are only produced for factor columns.
df_TM <- read.csv(file = "NB_CSV.csv", header = TRUE, stringsAsFactors = TRUE)
#View(df_TM)
## Summary of the dataset
summary(df_TM)
##   CustomerKey    MaritalStatus Gender   TotalChildren  
##  Min.   :11000   M:10011       F:9133   Min.   :0.000  
##  1st Qu.:15621   S: 8473       M:9351   1st Qu.:0.000  
##  Median :20242                          Median :2.000  
##  Mean   :20242                          Mean   :1.844  
##  3rd Qu.:24862                          3rd Qu.:3.000  
##  Max.   :29483                          Max.   :5.000  
##  NumberChildrenAtHome               Education             Occupation  
##  Min.   :0.000        Bachelors          :5356   Clerical      :2928  
##  1st Qu.:0.000        Graduate Degree    :3189   Management    :3075  
##  Median :0.000        High School        :3294   Manual        :2384  
##  Mean   :1.004        Partial College    :5064   Professional  :5520  
##  3rd Qu.:2.000        Partial High School:1581   Skilled Manual:4577  
##  Max.   :5.000                                                        
##  HouseOwnerFlag   NumberCarsOwned   CommuteDistance           Region    
##  Min.   :0.0000   Min.   :0.000   0-1 Miles :6310   Europe       :5503  
##  1st Qu.:0.0000   1st Qu.:1.000   1-2 Miles :3232   North America:9390  
##  Median :1.0000   Median :2.000   10+ Miles :2494   Pacific      :3591  
##  Mean   :0.6764   Mean   :1.503   2-5 Miles :3234                       
##  3rd Qu.:1.0000   3rd Qu.:2.000   5-10 Miles:3214                       
##  Max.   :1.0000   Max.   :4.000                                         
##    BikeBuyer    
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.494  
##  3rd Qu.:1.000  
##  Max.   :1.000
## Average of one column, then row counts per group
# Mean number of cars owned over all customers
colMeans(df_TM[, "NumberCarsOwned", drop = FALSE])
## NumberCarsOwned 
##        1.502705
# Number of customers for every Gender x Education combination
aggregate(
  CustomerKey ~ Gender + Education,
  data = df_TM,
  FUN = length
)
##    Gender           Education CustomerKey
## 1       F           Bachelors        2628
## 2       M           Bachelors        2728
## 3       F     Graduate Degree        1611
## 4       M     Graduate Degree        1578
## 5       F         High School        1595
## 6       M         High School        1699
## 7       F     Partial College        2542
## 8       M     Partial College        2522
## 9       F Partial High School         757
## 10      M Partial High School         824

Plot the number of customers in each region, with the bars split by education level

# ggplot2 is never attached earlier in the script (its library() call is
# commented out), so attach it before the first ggplot() call.
library("ggplot2")
# Stacked bar chart: one bar per Region, filled by Education level
ggplot(df_TM, aes(Region, fill = Education)) +
  geom_bar()

Plot two separate bar plots of occupation, split by marital status: one for married and the other for single customers

# Attach ggplot2 here as well so this plot runs on its own
# (library() does nothing further if the package is already attached).
library("ggplot2")
# One bar chart of Occupation per MaritalStatus level, stacked vertically.
# Fixes: the typographic quotes around white were not valid R string
# delimiters, and geom_histogram() bins a continuous x, so it fails on the
# discrete Occupation column; geom_bar() counts rows per category instead.
ggplot(df_TM, aes(Occupation)) +
  geom_bar(color = "white") +
  facet_grid(MaritalStatus ~ .)

# Build the Naive Bayes model
# (e1071 must already be attached for naiveBayes() to be found)
# Columns 2 to 11 are the predictors and column 12 is the target; see the
# column listing printed right after the fit.
TMNB <- naiveBayes(x = df_TM[, 2:11], y = df_TM[, 12])
colnames(df_TM)
##  [1] "CustomerKey"          "MaritalStatus"        "Gender"              
##  [4] "TotalChildren"        "NumberChildrenAtHome" "Education"           
##  [7] "Occupation"           "HouseOwnerFlag"       "NumberCarsOwned"     
## [10] "CommuteDistance"      "Region"               "BikeBuyer"
# A-priori class counts of the target variable
TMNB[["apriori"]]
## df_TM[, 12]
##    0    1 
## 9352 9132
# Per-predictor tables of the fitted model, one entry per predictor column
TMNB[["tables"]]
## $MaritalStatus
##            MaritalStatus
## df_TM[, 12]         M         S
##           0 0.5731394 0.4268606
##           1 0.5093079 0.4906921
## 
## $Gender
##            Gender
## df_TM[, 12]         F         M
##           0 0.4850299 0.5149701
##           1 0.5033947 0.4966053
## 
## $TotalChildren
##            TotalChildren
## df_TM[, 12]     [,1]     [,2]
##           0 2.046942 1.703394
##           1 1.636881 1.485295
## 
## $NumberChildrenAtHome
##            NumberChildrenAtHome
## df_TM[, 12]      [,1]     [,2]
##           0 1.1345167 1.618151
##           1 0.8704555 1.405815
## 
## $Education
##            Education
## df_TM[, 12]  Bachelors Graduate Degree High School Partial College
##           0 0.25053464      0.15911035  0.20455518      0.26946108
##           1 0.32993868      0.18626807  0.15122646      0.27858081
##            Education
## df_TM[, 12] Partial High School
##           0          0.11633875
##           1          0.05398598
## 
## $Occupation
##            Occupation
## df_TM[, 12]  Clerical Management    Manual Professional Skilled Manual
##           0 0.1378315  0.1755774 0.1437126    0.2941617      0.2487169
##           1 0.1794788  0.1569207 0.1138852    0.3032194      0.2464958
## 
## $HouseOwnerFlag
##            HouseOwnerFlag
## df_TM[, 12]      [,1]      [,2]
##           0 0.6729042 0.4691777
##           1 0.6799168 0.4665338
## 
## $NumberCarsOwned
##            NumberCarsOwned
## df_TM[, 12]     [,1]     [,2]
##           0 1.706159 1.119266
##           1 1.294350 1.120042
## 
## $CommuteDistance
##            CommuteDistance
## df_TM[, 12]  0-1 Miles  1-2 Miles  10+ Miles  2-5 Miles 5-10 Miles
##           0 0.29640719 0.18210009 0.16958939 0.14916595 0.20273738
##           1 0.38742882 0.16743320 0.09943057 0.20137976 0.14432764
## 
## $Region
##            Region
## df_TM[, 12]    Europe North America   Pacific
##           0 0.2941617     0.5522883 0.1535500
##           1 0.3013579     0.4626588 0.2359834
# Predictions
#predict(TMNB,df_TM,type = "raw")
# Raw prediction output for all rows, one column per class ("0" and "1")
df_PR <- predict(object = TMNB, newdata = df_TM, type = "raw")
df_PR <- as.data.frame(df_PR)
# Put the original data and the prediction columns side by side
df_TM_PR <- cbind(df_TM, df_PR)
#View(df_TM_PR)
# Scatter plot of the class-0 column against the class-1 column
plot(df_TM_PR$`0`, df_TM_PR$`1`)

# Box plots of the two prediction columns, drawn side by side
boxplot(df_TM_PR$`0`, df_TM_PR$`1`)

Neural networks

#install.packages("neuralnet")
library("neuralnet")
# Reading the data into R
df_TM_NN <- read.csv(file = "LG_CSV.csv", header = TRUE)
# View the structure of the data
names(df_TM_NN)
##  [1] "CustomerKey"          "MaritalStatus"        "Gender"              
##  [4] "TotalChildren"        "NumberChildrenAtHome" "Education"           
##  [7] "Occupation"           "HouseOwnerFlag"       "NumberCarsOwned"     
## [10] "CommuteDistance"      "Region"               "BikeBuyer"           
## [13] "Age"                  "YearlyIncome"
# Limit the dataset to (at most) the first 500 rows.
# head() is used instead of df_TM_NN[1:500, ] because the index form pads the
# result with all-NA rows whenever the file has fewer than 500 rows.
ds <- head(df_TM_NN, 500)
#View(df_TM_NN)
# neuralnet() starts from random weights; fix the seed so the fitted network
# (and the plot below) is the same on every run.
set.seed(123)
# Train the neural network with a single hidden layer of one neuron
# NOTE(review): the predictors are on very different scales and are used
# unscaled, and BikeBuyer is a 0/1 target fitted with the default linear
# output. Consider rescaling the inputs and linear.output = FALSE -- confirm
# the intended model before changing it.
TMNN <- neuralnet(
  BikeBuyer ~ YearlyIncome + Age + NumberCarsOwned,
  ds,
  hidden = 1
)
# Basic data
#print(TMNN)
# Predict
#prediction(TMNN)
# Plot the fitted network
plot(TMNN)