library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.3
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(lemon)
##
## Attaching package: 'lemon'
## The following object is masked from 'package:purrr':
##
## %||%
## The following objects are masked from 'package:ggplot2':
##
## CoordCartesian, element_render
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Load the mall customer records from the CSV in the working directory.
# readr guesses the column types and reports them below.
mall_customers <- read_csv(file = "Mall_Customers.csv")
## Parsed with column specification:
## cols(
## CustomerID = col_double(),
## Sex = col_character(),
## Age = col_double(),
## Annual_Income_k = col_double(),
## Spending_Score = col_double()
## )
# Open the loaded table in the data viewer for a quick visual check.
view(x = mall_customers)
Question #1 Conduct basic exploratory data analysis with the mall_customers.csv data set. Create 3 graphs of your choosing. For each, provide a 1-2 sentence summary of what you see.
# Smoothed trend of spending score against customer age, drawn without the
# confidence band (se = FALSE) on a light theme.
ggplot(data = mall_customers, mapping = aes(x = Age, y = Spending_Score)) +
  geom_smooth(se = FALSE) +
  theme_light() +
  ggtitle("Customers spending score by Age") +
  xlab("Age") +
  ylab("Spending Score")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# One point per customer: annual income (in thousands) split by sex, with the
# y axis tick labels formatted as dollar amounts.
ggplot(data = mall_customers, mapping = aes(x = Sex, y = Annual_Income_k)) +
  geom_point() +
  scale_y_continuous(labels = dollar_format()) +
  ggtitle("Annual Income for Female and Male Customers") +
  xlab("Sex") +
  ylab("Annual Income in thousands")
# Histogram of customer ages in five bins, showing how the customer base is
# spread across age groups.
# The x axis keeps the default numeric labels: Age is measured in years, so
# currency formatting does not apply to it.
mall_customers %>%
  ggplot(aes(x = Age)) +
  geom_histogram(bins = 5, col = "grey", fill = "green") +
  labs(
    x = "Customer's Age",
    y = "Age count",
    title = "Distribution of Customer Age"
  )
Question #2 Create clusters that look at both the annual income and spending score (your clustering should only look at these two columns).
Create an elbow plot and write a brief interpretation of 2-3 sentences for it. Create a silhouette plot and write a brief interpretation of 2-3 sentences for it. Make a recommendation for the number of clusters that should be used for this data set (there is no one right answer).
# Cluster on annual income and spending score only. Selecting the two columns
# by name (rather than by position) keeps the analysis correct if the column
# order of the CSV ever changes.
income_spending <- mall_customers %>%
  dplyr::select(Annual_Income_k, Spending_Score)

# Elbow plot: total within-cluster sum of squares for each candidate k
# (fviz_nbclust evaluates k up to its default k.max). Look for the k after
# which adding clusters stops reducing the within-cluster variation much.
set.seed(1)
fviz_nbclust(income_spending, kmeans, method = "wss", nstart = 10)

# Silhouette plot: average silhouette width for each candidate k; higher
# values indicate better-separated clusters.
set.seed(1)
fviz_nbclust(income_spending, kmeans, method = "silhouette", nstart = 10)

# Final model with k = 5. The seed is reset immediately before fitting so the
# result does not depend on the random draws consumed by the plots above.
set.seed(1)
fviz_clusters <- kmeans(income_spending, centers = 5, nstart = 10)
# Cluster centroids (mean income and spending score per cluster).
fviz_clusters$centers
## Annual_Income_k Spending_Score
## 1 25.72727 79.36364
## 2 88.20000 17.11429
## 3 55.29630 49.51852
## 4 86.53846 82.12821
## 5 26.30435 20.91304
# Between-cluster sum of squares of the fitted model.
fviz_clusters$betweenss
## [1] 225532.8
Question #3 Plot your best k-means model as a scatter plot with the centroids displayed.
# Attach each customer's cluster assignment as a character label so ggplot
# treats it as a discrete colour rather than a continuous scale.
mall_customers_cluster_df <- mall_customers %>%
  mutate(cluster = as.character(fviz_clusters$cluster))

# Centroids as a tibble; the matrix row names (the cluster ids) become the
# `cluster` column so the layer shares the same aesthetics as the customers.
cluster_centroids <- as_tibble(fviz_clusters$centers, rownames = "cluster")

# Scatter plot of the customers coloured by cluster, with each cluster's
# centroid overlaid as a large black cross.
mall_customers_cluster_df %>%
  ggplot(aes(x = Annual_Income_k, y = Spending_Score, color = cluster)) +
  geom_point() +
  geom_point(
    data = cluster_centroids,
    shape = 4,
    size = 5,
    stroke = 2,
    color = "black"
  ) +
  labs(
    title = "Annual Income and Spending Score cluster",
    color = "Cluster"
  )
Question #4 Write 4-5 sentences explaining what can be done with this new insight if you were in charge of the marketing and sales operation of the mall.
With the insights above we can see how the customers with the least annual income tend to have a higher spending score, and that as customers make more money their spending score decreases. We can also observe that younger customers have a higher spending score and that male customers tend to make more money than female customers.