#Question 4.1
# I work at an engineering consulting firm, where a clustering model would be appropriate for determining where to place a pump station. Some of the predictors that I might use are the distance to the residential area, the location's elevation, and the distance to the water treatment plant.
#Question 4.2
# NOTE: the global workspace is deliberately not wiped here. `rm(list = ls())`
# only deletes global objects (attached packages and options survive), so it
# does not give a clean session, and it would destroy the caller's workspace if
# this script were sourced. Restart R to run the analysis from a fresh state.
library(datasets)
library(cluster)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Import data: load the built-in iris data set and work on a copy.
data("iris")
fd <- iris
head(fd)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Remove the Species column (by name rather than position) and scale the data
# so that every measurement is centred and has unit standard deviation.
fd.scaled <- scale(fd[, names(fd) != "Species"])
# Elbow plot: total within-cluster sum of squares for k = 1..10. This is the
# plot used to choose the number of clusters below.
set.seed(1)
k_candidates <- 1:10
wss_by_k <- vapply(
  k_candidates,
  function(k) kmeans(fd.scaled, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)
plot(
  k_candidates, wss_by_k,
  type = "b",
  xlab = "Number of clusters k",
  ylab = "Total within-cluster sum of squares"
)
# Re-set the random number generator right before the final fit so that the
# result is reproducible and does not depend on the elbow runs above.
set.seed(1)
# From the plot, I determined the optimal number of clusters is 3.
kmeans_clusters <- kmeans(fd.scaled, centers = 3, nstart = 25)
# Clustering information
str(kmeans_clusters)
## List of 9
## $ cluster : int [1:150] 1 1 1 1 1 1 1 1 1 1 ...
## $ centers : num [1:3, 1:4] -1.0112 -0.0501 1.1322 0.8504 -0.8804 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ totss : num 596
## $ withinss : num [1:3] 47.4 44.1 47.5
## $ tot.withinss: num 139
## $ betweenss : num 457
## $ size : int [1:3] 50 53 47
## $ iter : int 2
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
##Question 5.1
#load library
library(outliers)
# Import data: whitespace-delimited text file with a header row, read from the
# current working directory.
crime_data <- read.table("uscrime.txt", header = TRUE, stringsAsFactors = FALSE)
# Show data
head(crime_data)
## M So Ed Po1 Po2 LF M.F Pop NW U1 U2 Wealth Ineq Prob
## 1 15.1 1 9.1 5.8 5.6 0.510 95.0 33 30.1 0.108 4.1 3940 26.1 0.084602
## 2 14.3 0 11.3 10.3 9.5 0.583 101.2 13 10.2 0.096 3.6 5570 19.4 0.029599
## 3 14.2 1 8.9 4.5 4.4 0.533 96.9 18 21.9 0.094 3.3 3180 25.0 0.083401
## 4 13.6 0 12.1 14.9 14.1 0.577 99.4 157 8.0 0.102 3.9 6730 16.7 0.015801
## 5 14.1 0 12.1 10.9 10.1 0.591 98.5 18 3.0 0.091 2.0 5780 17.4 0.041399
## 6 12.1 0 11.0 11.8 11.5 0.547 96.4 25 4.4 0.084 2.9 6890 12.6 0.034201
## Time Crime
## 1 26.2011 791
## 2 25.2999 1635
## 3 24.3006 578
## 4 29.9012 1969
## 5 21.2998 1234
## 6 20.9995 682
# The response is the Crime column (column 16); it is referenced by name so the
# tests keep working if the column order ever changes.
# Test 1: type = 10 tests whether the single value farthest from the mean (here
# the maximum, 1993) is an outlier. With p = 0.079 it is not significant at the
# 0.05 level.
grubbs.test(crime_data$Crime, type = 10)
##
## Grubbs test for one outlier
##
## data: crime_data$Crime
## G = 2.81287, U = 0.82426, p-value = 0.07887
## alternative hypothesis: highest value 1993 is an outlier
# Test 2: type = 11 tests whether the minimum and the maximum (342 and 1993) are
# both outliers. The p-value of 1 gives no evidence for that.
grubbs.test(crime_data$Crime, type = 11)
##
## Grubbs test for two opposite outliers
##
## data: crime_data$Crime
## G = 4.26877, U = 0.78103, p-value = 1
## alternative hypothesis: 342 and 1993 are outliers
# Using the boxplot rule to list the points that a box plot would draw as
# outliers (beyond the whiskers).
boxplot.stats(crime_data$Crime)$out
## [1] 1969 1674 1993
#Question 6.1 Traffic jams happen a lot in the area where I live. To avoid the traffic, I could use a change detection model to identify when there is a significant increase in the number of cars on the highway. I would choose the threshold based on the average number of cars during a traffic jam, and use a relatively large critical value to make the model less sensitive.