#Question 4.1

I work at an engineering consulting firm, where a clustering model would be appropriate for deciding where to place pump stations. Some of the predictors that I might use are the distance to the residential area, the location’s elevation, and the distance to the water treatment plant.

#Question 4.2

# Remove every object that ls() reports in the current environment so the
# analysis below does not pick up leftover variables.
# NOTE(review): this only deletes objects; attached packages and options()
# are left as they are, so it is not the same as a fresh R session. Consider
# restarting R instead of clearing the workspace -- confirm before changing.
rm(list = ls())

library(datasets)
library(cluster)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the built-in iris data set and keep a working copy in `fd`.
data("iris")
fd <- iris
head(fd)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Drop the Species label (column 5) and standardise the four measurements so
# that each one has mean 0 and standard deviation 1 before clustering.
fd.scaled <- scale(fd[, -5])

# Elbow search: total within-cluster sum of squares for k = 1..10.
# Seeded so the random starting centres are reproducible.
set.seed(1)
k_candidates <- 1:10
wss_by_k <- vapply(
  k_candidates,
  function(k) kmeans(fd.scaled, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)
plot(
  k_candidates, wss_by_k,
  type = "b",
  xlab = "Number of clusters (k)",
  ylab = "Total within-cluster sum of squares",
  main = "Elbow plot for k-means on scaled iris data"
)

# Re-seed right before the final fit so its result does not depend on how
# many random numbers the elbow search consumed.
set.seed(1)

# k = 3 is read off the elbow plot produced above.
kmeans_clusters <- kmeans(fd.scaled, 3, nstart = 25)

# Print a compact summary of the fitted k-means object: cluster assignments,
# centres, sums of squares, sizes and iteration count.
utils::str(object = kmeans_clusters)
## List of 9
##  $ cluster     : int [1:150] 1 1 1 1 1 1 1 1 1 1 ...
##  $ centers     : num [1:3, 1:4] -1.0112 -0.0501 1.1322 0.8504 -0.8804 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ totss       : num 596
##  $ withinss    : num [1:3] 47.4 44.1 47.5
##  $ tot.withinss: num 139
##  $ betweenss   : num 457
##  $ size        : int [1:3] 50 53 47
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

##Question 5.1

#load library
library(outliers)

# Read the US crime table from the working directory; the first row of the
# file supplies the column names and text columns stay as character.
crime_data <- read.table(
  file = "uscrime.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Preview the first six rows
head(crime_data, n = 6)
##      M So   Ed  Po1  Po2    LF   M.F Pop   NW    U1  U2 Wealth Ineq     Prob
## 1 15.1  1  9.1  5.8  5.6 0.510  95.0  33 30.1 0.108 4.1   3940 26.1 0.084602
## 2 14.3  0 11.3 10.3  9.5 0.583 101.2  13 10.2 0.096 3.6   5570 19.4 0.029599
## 3 14.2  1  8.9  4.5  4.4 0.533  96.9  18 21.9 0.094 3.3   3180 25.0 0.083401
## 4 13.6  0 12.1 14.9 14.1 0.577  99.4 157  8.0 0.102 3.9   6730 16.7 0.015801
## 5 14.1  0 12.1 10.9 10.1 0.591  98.5  18  3.0 0.091 2.0   5780 17.4 0.041399
## 6 12.1  0 11.0 11.8 11.5 0.547  96.4  25  4.4 0.084 2.9   6890 12.6 0.034201
##      Time Crime
## 1 26.2011   791
## 2 25.2999  1635
## 3 24.3006   578
## 4 29.9012  1969
## 5 21.2998  1234
## 6 20.9995   682
# Test 1: Grubbs test with type = 10 (a single outlier) on column 16, Crime
outliers::grubbs.test(x = crime_data[, 16], type = 10)
## 
##  Grubbs test for one outlier
## 
## data:  crime_data[, 16]
## G = 2.81287, U = 0.82426, p-value = 0.07887
## alternative hypothesis: highest value 1993 is an outlier
# Test 2: Grubbs test with type = 11 (two outliers on opposite tails)
outliers::grubbs.test(x = crime_data[, 16], type = 11)
## 
##  Grubbs test for two opposite outliers
## 
## data:  crime_data[, 16]
## G = 4.26877, U = 0.78103, p-value = 1
## alternative hypothesis: 342 and 1993 are outliers
# Box-plot rule: list the Crime values that fall beyond the whiskers
boxplot.stats(crime_data$Crime)[["out"]]
## [1] 1969 1674 1993

#Question 6.1 Traffic jams happen a lot in the area where I live. To avoid the traffic, I could use a change detection model to identify when there is a significant increase in the number of cars on the highway. I would choose the threshold based on the average number of cars during a traffic jam, and use a relatively large critical value to make the model less sensitive.