About this Dataset Context This data set was created only for the purpose of learning customer segmentation concepts, also known as market basket analysis. I will demonstrate this by using an unsupervised ML technique (KMeans clustering algorithm) in its simplest form.
Content You own a supermarket mall and, through membership cards, you have some basic data about your customers such as customer ID, age, gender, annual income and spending score. Spending score is something you assign to each customer based on your defined parameters, such as customer behavior and purchasing data.
Problem Statement You own the mall and want to understand your customers, for example which of them can be easily converted [Target Customers], so that this insight can be given to the marketing team and the strategy planned accordingly.
Importing the data
# Load the mall customer records into `df` and display every row.
# read.csv() comes from base R (utils); readr is attached but not called here.
library(readr)
df <- read.csv(file = "Mall_Customers.csv")
print(df)
## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76
## 7 7 Female 35 18 6
## 8 8 Female 23 18 94
## 9 9 Male 64 19 3
## 10 10 Female 30 19 72
## 11 11 Male 67 19 14
## 12 12 Female 35 19 99
## 13 13 Female 58 20 15
## 14 14 Female 24 20 77
## 15 15 Male 37 20 13
## 16 16 Male 22 20 79
## 17 17 Female 35 21 35
## 18 18 Male 20 21 66
## 19 19 Male 52 23 29
## 20 20 Female 35 23 98
## 21 21 Male 35 24 35
## 22 22 Male 25 24 73
## 23 23 Female 46 25 5
## 24 24 Male 31 25 73
## 25 25 Female 54 28 14
## 26 26 Male 29 28 82
## 27 27 Female 45 28 32
## 28 28 Male 35 28 61
## 29 29 Female 40 29 31
## 30 30 Female 23 29 87
## 31 31 Male 60 30 4
## 32 32 Female 21 30 73
## 33 33 Male 53 33 4
## 34 34 Male 18 33 92
## 35 35 Female 49 33 14
## 36 36 Female 21 33 81
## 37 37 Female 42 34 17
## 38 38 Female 30 34 73
## 39 39 Female 36 37 26
## 40 40 Female 20 37 75
## 41 41 Female 65 38 35
## 42 42 Male 24 38 92
## 43 43 Male 48 39 36
## 44 44 Female 31 39 61
## 45 45 Female 49 39 28
## 46 46 Female 24 39 65
## 47 47 Female 50 40 55
## 48 48 Female 27 40 47
## 49 49 Female 29 40 42
## 50 50 Female 31 40 42
## 51 51 Female 49 42 52
## 52 52 Male 33 42 60
## 53 53 Female 31 43 54
## 54 54 Male 59 43 60
## 55 55 Female 50 43 45
## 56 56 Male 47 43 41
## 57 57 Female 51 44 50
## 58 58 Male 69 44 46
## 59 59 Female 27 46 51
## 60 60 Male 53 46 46
## 61 61 Male 70 46 56
## 62 62 Male 19 46 55
## 63 63 Female 67 47 52
## 64 64 Female 54 47 59
## 65 65 Male 63 48 51
## 66 66 Male 18 48 59
## 67 67 Female 43 48 50
## 68 68 Female 68 48 48
## 69 69 Male 19 48 59
## 70 70 Female 32 48 47
## 71 71 Male 70 49 55
## 72 72 Female 47 49 42
## 73 73 Female 60 50 49
## 74 74 Female 60 50 56
## 75 75 Male 59 54 47
## 76 76 Male 26 54 54
## 77 77 Female 45 54 53
## 78 78 Male 40 54 48
## 79 79 Female 23 54 52
## 80 80 Female 49 54 42
## 81 81 Male 57 54 51
## 82 82 Male 38 54 55
## 83 83 Male 67 54 41
## 84 84 Female 46 54 44
## 85 85 Female 21 54 57
## 86 86 Male 48 54 46
## 87 87 Female 55 57 58
## 88 88 Female 22 57 55
## 89 89 Female 34 58 60
## 90 90 Female 50 58 46
## 91 91 Female 68 59 55
## 92 92 Male 18 59 41
## 93 93 Male 48 60 49
## 94 94 Female 40 60 40
## 95 95 Female 32 60 42
## 96 96 Male 24 60 52
## 97 97 Female 47 60 47
## 98 98 Female 27 60 50
## 99 99 Male 48 61 42
## 100 100 Male 20 61 49
## 101 101 Female 23 62 41
## 102 102 Female 49 62 48
## 103 103 Male 67 62 59
## 104 104 Male 26 62 55
## 105 105 Male 49 62 56
## 106 106 Female 21 62 42
## 107 107 Female 66 63 50
## 108 108 Male 54 63 46
## 109 109 Male 68 63 43
## 110 110 Male 66 63 48
## 111 111 Male 65 63 52
## 112 112 Female 19 63 54
## 113 113 Female 38 64 42
## 114 114 Male 19 64 46
## 115 115 Female 18 65 48
## 116 116 Female 19 65 50
## 117 117 Female 63 65 43
## 118 118 Female 49 65 59
## 119 119 Female 51 67 43
## 120 120 Female 50 67 57
## 121 121 Male 27 67 56
## 122 122 Female 38 67 40
## 123 123 Female 40 69 58
## 124 124 Male 39 69 91
## 125 125 Female 23 70 29
## 126 126 Female 31 70 77
## 127 127 Male 43 71 35
## 128 128 Male 40 71 95
## 129 129 Male 59 71 11
## 130 130 Male 38 71 75
## 131 131 Male 47 71 9
## 132 132 Male 39 71 75
## 133 133 Female 25 72 34
## 134 134 Female 31 72 71
## 135 135 Male 20 73 5
## 136 136 Female 29 73 88
## 137 137 Female 44 73 7
## 138 138 Male 32 73 73
## 139 139 Male 19 74 10
## 140 140 Female 35 74 72
## 141 141 Female 57 75 5
## 142 142 Male 32 75 93
## 143 143 Female 28 76 40
## 144 144 Female 32 76 87
## 145 145 Male 25 77 12
## 146 146 Male 28 77 97
## 147 147 Male 48 77 36
## 148 148 Female 32 77 74
## 149 149 Female 34 78 22
## 150 150 Male 34 78 90
## 151 151 Male 43 78 17
## 152 152 Male 39 78 88
## 153 153 Female 44 78 20
## 154 154 Female 38 78 76
## 155 155 Female 47 78 16
## 156 156 Female 27 78 89
## 157 157 Male 37 78 1
## 158 158 Female 30 78 78
## 159 159 Male 34 78 1
## 160 160 Female 30 78 73
## 161 161 Female 56 79 35
## 162 162 Female 29 79 83
## 163 163 Male 19 81 5
## 164 164 Female 31 81 93
## 165 165 Male 50 85 26
## 166 166 Female 36 85 75
## 167 167 Male 42 86 20
## 168 168 Female 33 86 95
## 169 169 Female 36 87 27
## 170 170 Male 32 87 63
## 171 171 Male 40 87 13
## 172 172 Male 28 87 75
## 173 173 Male 36 87 10
## 174 174 Male 36 87 92
## 175 175 Female 52 88 13
## 176 176 Female 30 88 86
## 177 177 Male 58 88 15
## 178 178 Male 27 88 69
## 179 179 Male 59 93 14
## 180 180 Male 35 93 90
## 181 181 Female 37 97 32
## 182 182 Female 32 97 86
## 183 183 Male 46 98 15
## 184 184 Female 29 98 88
## 185 185 Female 41 99 39
## 186 186 Male 30 99 97
## 187 187 Female 54 101 24
## 188 188 Male 28 101 68
## 189 189 Female 41 103 17
## 190 190 Female 36 103 85
## 191 191 Female 34 103 23
## 192 192 Female 32 103 69
## 193 193 Male 33 113 8
## 194 194 Female 38 113 91
## 195 195 Female 47 120 16
## 196 196 Female 35 120 79
## 197 197 Female 45 126 28
## 198 198 Male 32 126 74
## 199 199 Male 32 137 18
## 200 200 Male 30 137 83
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Scatter plot of annual income against spending score, coloured by gender.
# `bins` is not a geom_point() parameter (ggplot2 only warned and ignored it),
# so it has been removed; the rendered plot is unchanged.
ggplot(df, aes(Annual.Income..k.., Spending.Score..1.100.)) +
  geom_point(aes(color = Genre), size = 3) +
  coord_cartesian(xlim = c(0, 150), ylim = c(0, 120)) +
  ggtitle("Mall Customer details") +
  theme_bw()
Acquiring required columns
# Display the full data frame again; no columns are selected at this point.
print(df)
## CustomerID Genre Age Annual.Income..k.. Spending.Score..1.100.
## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76
## 7 7 Female 35 18 6
## 8 8 Female 23 18 94
## 9 9 Male 64 19 3
## 10 10 Female 30 19 72
## 11 11 Male 67 19 14
## 12 12 Female 35 19 99
## 13 13 Female 58 20 15
## 14 14 Female 24 20 77
## 15 15 Male 37 20 13
## 16 16 Male 22 20 79
## 17 17 Female 35 21 35
## 18 18 Male 20 21 66
## 19 19 Male 52 23 29
## 20 20 Female 35 23 98
## 21 21 Male 35 24 35
## 22 22 Male 25 24 73
## 23 23 Female 46 25 5
## 24 24 Male 31 25 73
## 25 25 Female 54 28 14
## 26 26 Male 29 28 82
## 27 27 Female 45 28 32
## 28 28 Male 35 28 61
## 29 29 Female 40 29 31
## 30 30 Female 23 29 87
## 31 31 Male 60 30 4
## 32 32 Female 21 30 73
## 33 33 Male 53 33 4
## 34 34 Male 18 33 92
## 35 35 Female 49 33 14
## 36 36 Female 21 33 81
## 37 37 Female 42 34 17
## 38 38 Female 30 34 73
## 39 39 Female 36 37 26
## 40 40 Female 20 37 75
## 41 41 Female 65 38 35
## 42 42 Male 24 38 92
## 43 43 Male 48 39 36
## 44 44 Female 31 39 61
## 45 45 Female 49 39 28
## 46 46 Female 24 39 65
## 47 47 Female 50 40 55
## 48 48 Female 27 40 47
## 49 49 Female 29 40 42
## 50 50 Female 31 40 42
## 51 51 Female 49 42 52
## 52 52 Male 33 42 60
## 53 53 Female 31 43 54
## 54 54 Male 59 43 60
## 55 55 Female 50 43 45
## 56 56 Male 47 43 41
## 57 57 Female 51 44 50
## 58 58 Male 69 44 46
## 59 59 Female 27 46 51
## 60 60 Male 53 46 46
## 61 61 Male 70 46 56
## 62 62 Male 19 46 55
## 63 63 Female 67 47 52
## 64 64 Female 54 47 59
## 65 65 Male 63 48 51
## 66 66 Male 18 48 59
## 67 67 Female 43 48 50
## 68 68 Female 68 48 48
## 69 69 Male 19 48 59
## 70 70 Female 32 48 47
## 71 71 Male 70 49 55
## 72 72 Female 47 49 42
## 73 73 Female 60 50 49
## 74 74 Female 60 50 56
## 75 75 Male 59 54 47
## 76 76 Male 26 54 54
## 77 77 Female 45 54 53
## 78 78 Male 40 54 48
## 79 79 Female 23 54 52
## 80 80 Female 49 54 42
## 81 81 Male 57 54 51
## 82 82 Male 38 54 55
## 83 83 Male 67 54 41
## 84 84 Female 46 54 44
## 85 85 Female 21 54 57
## 86 86 Male 48 54 46
## 87 87 Female 55 57 58
## 88 88 Female 22 57 55
## 89 89 Female 34 58 60
## 90 90 Female 50 58 46
## 91 91 Female 68 59 55
## 92 92 Male 18 59 41
## 93 93 Male 48 60 49
## 94 94 Female 40 60 40
## 95 95 Female 32 60 42
## 96 96 Male 24 60 52
## 97 97 Female 47 60 47
## 98 98 Female 27 60 50
## 99 99 Male 48 61 42
## 100 100 Male 20 61 49
## 101 101 Female 23 62 41
## 102 102 Female 49 62 48
## 103 103 Male 67 62 59
## 104 104 Male 26 62 55
## 105 105 Male 49 62 56
## 106 106 Female 21 62 42
## 107 107 Female 66 63 50
## 108 108 Male 54 63 46
## 109 109 Male 68 63 43
## 110 110 Male 66 63 48
## 111 111 Male 65 63 52
## 112 112 Female 19 63 54
## 113 113 Female 38 64 42
## 114 114 Male 19 64 46
## 115 115 Female 18 65 48
## 116 116 Female 19 65 50
## 117 117 Female 63 65 43
## 118 118 Female 49 65 59
## 119 119 Female 51 67 43
## 120 120 Female 50 67 57
## 121 121 Male 27 67 56
## 122 122 Female 38 67 40
## 123 123 Female 40 69 58
## 124 124 Male 39 69 91
## 125 125 Female 23 70 29
## 126 126 Female 31 70 77
## 127 127 Male 43 71 35
## 128 128 Male 40 71 95
## 129 129 Male 59 71 11
## 130 130 Male 38 71 75
## 131 131 Male 47 71 9
## 132 132 Male 39 71 75
## 133 133 Female 25 72 34
## 134 134 Female 31 72 71
## 135 135 Male 20 73 5
## 136 136 Female 29 73 88
## 137 137 Female 44 73 7
## 138 138 Male 32 73 73
## 139 139 Male 19 74 10
## 140 140 Female 35 74 72
## 141 141 Female 57 75 5
## 142 142 Male 32 75 93
## 143 143 Female 28 76 40
## 144 144 Female 32 76 87
## 145 145 Male 25 77 12
## 146 146 Male 28 77 97
## 147 147 Male 48 77 36
## 148 148 Female 32 77 74
## 149 149 Female 34 78 22
## 150 150 Male 34 78 90
## 151 151 Male 43 78 17
## 152 152 Male 39 78 88
## 153 153 Female 44 78 20
## 154 154 Female 38 78 76
## 155 155 Female 47 78 16
## 156 156 Female 27 78 89
## 157 157 Male 37 78 1
## 158 158 Female 30 78 78
## 159 159 Male 34 78 1
## 160 160 Female 30 78 73
## 161 161 Female 56 79 35
## 162 162 Female 29 79 83
## 163 163 Male 19 81 5
## 164 164 Female 31 81 93
## 165 165 Male 50 85 26
## 166 166 Female 36 85 75
## 167 167 Male 42 86 20
## 168 168 Female 33 86 95
## 169 169 Female 36 87 27
## 170 170 Male 32 87 63
## 171 171 Male 40 87 13
## 172 172 Male 28 87 75
## 173 173 Male 36 87 10
## 174 174 Male 36 87 92
## 175 175 Female 52 88 13
## 176 176 Female 30 88 86
## 177 177 Male 58 88 15
## 178 178 Male 27 88 69
## 179 179 Male 59 93 14
## 180 180 Male 35 93 90
## 181 181 Female 37 97 32
## 182 182 Female 32 97 86
## 183 183 Male 46 98 15
## 184 184 Female 29 98 88
## 185 185 Female 41 99 39
## 186 186 Male 30 99 97
## 187 187 Female 54 101 24
## 188 188 Male 28 101 68
## 189 189 Female 41 103 17
## 190 190 Female 36 103 85
## 191 191 Female 34 103 23
## 192 192 Female 32 103 69
## 193 193 Male 33 113 8
## 194 194 Female 38 113 91
## 195 195 Female 47 120 16
## 196 196 Female 35 120 79
## 197 197 Female 45 126 28
## 198 198 Male 32 126 74
## 199 199 Male 32 137 18
## 200 200 Male 30 137 83
Representing with Plotly
# Re-read the CSV and keep only the annual-income and spending-score columns
# (the 4th and 5th columns of the file), then display the subset.
library(plotly)
set.seed(955)
dat <- read.csv(file = "Mall_Customers.csv")
data <- dat[c("Annual.Income..k..", "Spending.Score..1.100.")]
print(data)
## Annual.Income..k.. Spending.Score..1.100.
## 1 15 39
## 2 15 81
## 3 16 6
## 4 16 77
## 5 17 40
## 6 17 76
## 7 18 6
## 8 18 94
## 9 19 3
## 10 19 72
## 11 19 14
## 12 19 99
## 13 20 15
## 14 20 77
## 15 20 13
## 16 20 79
## 17 21 35
## 18 21 66
## 19 23 29
## 20 23 98
## 21 24 35
## 22 24 73
## 23 25 5
## 24 25 73
## 25 28 14
## 26 28 82
## 27 28 32
## 28 28 61
## 29 29 31
## 30 29 87
## 31 30 4
## 32 30 73
## 33 33 4
## 34 33 92
## 35 33 14
## 36 33 81
## 37 34 17
## 38 34 73
## 39 37 26
## 40 37 75
## 41 38 35
## 42 38 92
## 43 39 36
## 44 39 61
## 45 39 28
## 46 39 65
## 47 40 55
## 48 40 47
## 49 40 42
## 50 40 42
## 51 42 52
## 52 42 60
## 53 43 54
## 54 43 60
## 55 43 45
## 56 43 41
## 57 44 50
## 58 44 46
## 59 46 51
## 60 46 46
## 61 46 56
## 62 46 55
## 63 47 52
## 64 47 59
## 65 48 51
## 66 48 59
## 67 48 50
## 68 48 48
## 69 48 59
## 70 48 47
## 71 49 55
## 72 49 42
## 73 50 49
## 74 50 56
## 75 54 47
## 76 54 54
## 77 54 53
## 78 54 48
## 79 54 52
## 80 54 42
## 81 54 51
## 82 54 55
## 83 54 41
## 84 54 44
## 85 54 57
## 86 54 46
## 87 57 58
## 88 57 55
## 89 58 60
## 90 58 46
## 91 59 55
## 92 59 41
## 93 60 49
## 94 60 40
## 95 60 42
## 96 60 52
## 97 60 47
## 98 60 50
## 99 61 42
## 100 61 49
## 101 62 41
## 102 62 48
## 103 62 59
## 104 62 55
## 105 62 56
## 106 62 42
## 107 63 50
## 108 63 46
## 109 63 43
## 110 63 48
## 111 63 52
## 112 63 54
## 113 64 42
## 114 64 46
## 115 65 48
## 116 65 50
## 117 65 43
## 118 65 59
## 119 67 43
## 120 67 57
## 121 67 56
## 122 67 40
## 123 69 58
## 124 69 91
## 125 70 29
## 126 70 77
## 127 71 35
## 128 71 95
## 129 71 11
## 130 71 75
## 131 71 9
## 132 71 75
## 133 72 34
## 134 72 71
## 135 73 5
## 136 73 88
## 137 73 7
## 138 73 73
## 139 74 10
## 140 74 72
## 141 75 5
## 142 75 93
## 143 76 40
## 144 76 87
## 145 77 12
## 146 77 97
## 147 77 36
## 148 77 74
## 149 78 22
## 150 78 90
## 151 78 17
## 152 78 88
## 153 78 20
## 154 78 76
## 155 78 16
## 156 78 89
## 157 78 1
## 158 78 78
## 159 78 1
## 160 78 73
## 161 79 35
## 162 79 83
## 163 81 5
## 164 81 93
## 165 85 26
## 166 85 75
## 167 86 20
## 168 86 95
## 169 87 27
## 170 87 63
## 171 87 13
## 172 87 75
## 173 87 10
## 174 87 92
## 175 88 13
## 176 88 86
## 177 88 15
## 178 88 69
## 179 93 14
## 180 93 90
## 181 97 32
## 182 97 86
## 183 98 15
## 184 98 88
## 185 99 39
## 186 99 97
## 187 101 24
## 188 101 68
## 189 103 17
## 190 103 85
## 191 103 23
## 192 103 69
## 193 113 8
## 194 113 91
## 195 120 16
## 196 120 79
## 197 126 28
## 198 126 74
## 199 137 18
## 200 137 83
# Build the income-vs-spending scatter plot with ggplot2, then convert it to
# an interactive plotly figure and display it.
p <- ggplot(
  data,
  aes(x = Annual.Income..k.., y = Spending.Score..1.100.)
) +
  geom_point(
    shape = 13,
    size = 4,
    alpha = 0.4,
    fill = "black",
    color = "red"
  ) +
  theme_bw() +
  ggtitle("Mall Customer Details")
fig <- ggplotly(p)
fig
Plotting a dendrogram to find the optimal number of clusters
# Hierarchical clustering on the two features under study only. Passing the
# whole data frame to dist() pulled CustomerID and Age into the distances and
# coerced the character Genre column to NA (the source of the former
# "NAs introduced by coercion" warning).
features <- df[, c("Annual.Income..k..", "Spending.Score..1.100.")]
# NOTE(review): the hclust documentation states that "ward.D2", not "ward.D",
# implements Ward's criterion on unsquared distances; consider switching.
denogram <- hclust(dist(features, method = "euclidean"), method = "ward.D")
# A dendrogram's leaves are observations and its vertical axis is the merge
# height, so the axes are labelled as such rather than as the input variables.
plot(
  denogram,
  xlab = "Customers",
  ylab = "Merge height",
  main = "Finding optimal number of cluster"
)
Using the cutree function to cut the dendrogram into clusters
# Cut the hierarchical tree into 5 groups; `hcul` holds one cluster id per row.
hcul <- cutree(tree = denogram, k = 5)
Final Analysis
library(cluster)
# Visualise the cluster assignment in `hcul`.
# - Only the income and spending-score columns are plotted; the full data
#   frame also contains CustomerID and the character Genre column, which do
#   not belong in the projection.
# - clusplot() draws the points on principal components, so the axes are
#   labelled as components rather than as the raw variables.
# - TRUE/FALSE literals replace the reassignable T/F shorthands; `lines = 0`
#   is the numeric form of the original `lines = F` (no distance lines).
clusplot(
  df[, c("Annual.Income..k..", "Spending.Score..1.100.")],
  hcul,
  main = "MALL CUSTOMER DETAILS",
  xlab = "Component 1",
  ylab = "Component 2",
  lines = 0,
  shade = TRUE,
  span = TRUE,
  color = TRUE,
  labels = 2
)
library(andrews)
# Draw Andrews curves for the customer data.
# NOTE(review): `clr = 4` presumably selects the 4th column
# (Annual.Income..k..) to colour the curves; confirm against the andrews
# package documentation.
andrews(
  df,
  type = 2,
  clr = 4,
  step = 100,
  ymax = 3,
  main = "Andrews curve",
  sub = "Mall customers"
)
# Histogram of customer ages with a normal curve (sample mean and sd) overlaid.
x <- df$Age
h <- hist(
  x,
  breaks = 10,
  col = "red",
  xlab = "Range of Ages",
  main = "Distribution on Ages"
)
# 40 evaluation points across the observed age range; `length.out` is spelled
# out instead of relying on partial argument matching of `length`.
xfit <- seq(min(x), max(x), length.out = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
# Rescale the density to the histogram's count axis:
# density * bin width * number of observations.
yfit <- yfit * diff(h$mids[1:2]) * length(x)
# lines() draws on the base-graphics device and returns NULL. A ggplot2 theme
# cannot style a base plot, and `NULL + theme_bw()` merely printed NULL, so
# the stray `+ theme_bw()` has been removed.
lines(xfit, yfit, col = "blue", lwd = 2)
library(ggplot2)
# Box plot of spending score for each gender, outlined in blue.
ggplot(data = df, mapping = aes(x = Genre, y = Spending.Score..1.100.)) +
  geom_boxplot(col = "Blue")
library(viridis)
## Loading required package: viridisLite
# Violin plot of annual income for each gender.
# The original added scale_fill_viridis() without mapping any fill aesthetic,
# so the scale had nothing to colour. Genre is now mapped to `fill`, and the
# scale is marked discrete because Genre is categorical.
# NOTE(review): `option` is documented as a character string; the numeric `1`
# is replaced by "A" (the first palette). Confirm the intended palette.
ggplot(df, aes(Genre, Annual.Income..k.., fill = Genre)) +
  geom_violin() +
  theme_bw() +
  ylim(0, 150) +
  ggtitle("Gender vs Anual Income") +
  scale_fill_viridis(discrete = TRUE, option = "A")