Scatter Plot
load("C:/Users/USER/Downloads/cdc.Rdata")
head(cdc)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
hist(cdc$weight)

plot(cdc$weight, cdc$wtdesire,col=cdc$genhlth)

data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
median(iris$Petal.Length)
## [1] 4.35
plot(x=iris$Sepal.Length, y = iris$Petal.Length)
abline(h = median(iris$Petal.Length), col='orange')

plot(x=iris$Sepal.Length, y = iris$Petal.Length, col=iris$Species)

plot(x=iris$Sepal.Length, y = iris$Petal.Length, col=ifelse(iris$Petal.Length >= median(iris$Petal.Length), 'red', 'blue'))
abline(h = median(iris$Petal.Length), col='orange')

setosa <- iris[iris$Species=='setosa',]
versicolor <- iris[iris$Species=='versicolor',]
plot(x=iris$Sepal.Length, y = iris$Petal.Length, type='n')
points(x=setosa$Sepal.Length, y= setosa$Petal.Length, col='green')
points(x=versicolor$Sepal.Length, y= versicolor$Petal.Length, col='red')

fit <- lm(cdc$wtdesire ~ cdc$weight)
fit
##
## Call:
## lm(formula = cdc$wtdesire ~ cdc$weight)
##
## Coefficients:
## (Intercept) cdc$weight
## 46.664 0.639
plot(cdc$weight, cdc$wtdesire)
abline(fit, col='red')

Mosaic Plot
head(cdc)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
tb <- table(cdc$smoke100)
barplot(tb)

smokers_gender <- table(cdc$gender, cdc$smoke100)
colnames(smokers_gender)
## [1] "0" "1"
colnames(smokers_gender) <- c('no', 'yes')
mosaicplot(smokers_gender, col=c('blue', 'red'))

rainbow(2)
## [1] "#FF0000FF" "#00FFFFFF"
#mosaicplot(smokers_gender, col=rainbow(2))
mosaicplot(smokers_gender, col=rainbow(length(colnames(smokers_gender))))

#smokers_gender
barplot(smokers_gender,col=c("darkblue","red"), beside=TRUE)

barplot(smokers_gender,col=c("darkblue","red"), beside=FALSE)

Boxplot
hist(cdc$weight)

summary(cdc$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 68.0 140.0 165.0 169.7 190.0 500.0
boxplot(cdc$weight)

boxplot(cdc$weight ~ cdc$gender)

salary <- c(45,56,62,70,66,72,54,49,100)
sum(salary) / length(salary)
## [1] 63.77778
mean(salary)
## [1] 63.77778
sort(salary)
## [1] 45 49 54 56 62 66 70 72 100
median(salary)
## [1] 62
quantile(salary, 0.25)
## 25%
## 54
quantile(salary, 0.75)
## 75%
## 70
quantile(salary, 0.75) - quantile(salary, 0.25)
## 75%
## 16
IQR(salary)
## [1] 16
max(min(salary), median(salary) - 1.5 * IQR(salary) )
## [1] 45
lower_adjencent_value <- min(salary[salary > (median(salary) - 1.5 * IQR(salary))])
upper_adgencent_value <- max(salary[salary < (median(salary) + 1.5 * IQR(salary))])
lower_hinge <- quantile(salary, 0.25)
upper_hinge <- quantile(salary, 0.75)
print(paste(lower_adjencent_value, lower_hinge, median(salary), upper_hinge, upper_adgencent_value))
## [1] "45 54 62 70 72"
boxplot(salary)

Legend
taipei <- c(92.5,132.6,168.8,159.1,218.7)
tainan <- c(21.2, 30.6, 37.3, 84.6, 184.3)
plot(taipei, type="o", col="blue",ylim=c(0,300), xlim=c(1,7), xlab="Month", ylab="Rainfall", main = 'Rainfall from Taipei and Tainan')
lines(tainan, type="o", pch=22, lty=2, col="red")
#legend('topleft',c('taipei', 'tainan'), col=c('blue', 'red'), lwd=2.5 , title= 'Rainfall', lty=c(1,2))
legend(x = 1, y = 250,c('taipei', 'tainan'), col=c('blue', 'red'), lwd=2.5 , title= 'Rainfall', lty=c(1,2))
text(x=5, y= 250, 'Taipei', col='blue')
text(x=5.5, y= 200, 'Tainan', col='red')

house <- read.csv('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/house-prices.csv')
bedroomsTable <- table(house$Bedrooms)
pie(bedroomsTable, main="Bedroom Type Calculate",col = rainbow(length(names(bedroomsTable))))
legend('bottomright', legend = names(bedroomsTable), fill = rainbow(length(names(bedroomsTable))),title="units", cex=0.8)

par
showLayout<-function(n){
for( i in 1:n){
plot(1,type="n",xaxt="n",yaxt="n",xlab="",ylab="")
text(1, 1, labels=i, cex=10)
}
}
par(mfrow=c(3,2), mar=c(1,1,1,1))
showLayout(6)
par(mfcol=c(3,2), mar=c(1,1,1,1))
showLayout(6)
par(mfrow=c(3,2), mar=c(3,3,3,3))
showLayout(6)

jpeg("layout.jpg")
par(mfrow=c(3,2), mar=c(3,3,3,3))
showLayout(6)
dev.off()
## png
## 2
Donut Chart
#install.packages('plotly')
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.4
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.4
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
ds <- data.frame(labels=c("A", "B", "C"),values =c(10, 20, 30))
plot_ly(ds, labels=ds$labels, values =ds$values, type ="pie", hole=0.6) %>% layout(title="Donut Chart Example")
## Warning: package 'bindrcpp' was built under R version 3.4.4
plot_ly(ds, labels=ds$labels, values =ds$values, type ="pie")
Area Chart
library(plotly)
month<-c(1,2,3,4,5)
taipei<-c(92.5,132.6,168.8,159.1,218.7)
tainan <-c(21.2, 30.6, 37.3, 84.6, 184.3)
plot_ly(x =month, y =taipei, name="taipei",type='scatter', mode='lines')
plot_ly(x =month, y =taipei,fill ="tozeroy", name="taipei",type='scatter', mode='markers') %>% add_trace(x =month, y =tainan, fill ="tozeroy",name="tainan") %>% layout(yaxis=list(title='rainfall'))
total <- taipei + tainan
plot_ly(x =month, y =taipei,fill ="tozeroy", name="taipei",type='scatter', mode='markers') %>% add_trace(x =month, y =total, fill ="tonexty",name="tainan") %>% layout(yaxis=list(title='rainfall'))
Diamonds
data("diamonds")
diamonds
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.230 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
## 2 0.210 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
## 3 0.230 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
## 5 0.310 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
## 6 0.240 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
## 7 0.240 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
## 8 0.260 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
## 9 0.220 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
## 10 0.230 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39
## # ... with 53,930 more rows
d <-diamonds[sample(nrow(diamonds), 1000), ]
plot_ly(d, x =d$carat, y=d$price, color =d$clarity, type='scatter', mode='markers', size =d$carat, text=paste("Clarity", d$clarity))
Multiple Charts
data("economics")
plot_ly(economics, x =economics$date, y =economics$unemploy, type='scatter', mode='lines')
plot_ly(economics, x =economics$date, y =economics$uempmed, type='scatter', mode='lines')
p <-subplot(
plot_ly(economics, x =economics$date, y =economics$unemploy, type='scatter', mode='lines'),
plot_ly(economics, x =economics$date, y =economics$uempmed, type='scatter', mode='lines'),
margin=0.05
)
p %>%layout(showlegend=FALSE)
p <-subplot(
plot_ly(economics, x =economics$date, y =economics$unemploy, type='scatter', mode='lines'),
plot_ly(economics, x =economics$date, y =economics$uempmed, type='scatter', mode='lines'),
margin=0.05,
nrows = 2
)
p %>%layout(showlegend=FALSE)
#plot_ly(economics, x =economics$date, y =economics$unemploy, type='scatter', mode='lines') %>% add_trace(economics, x =economics$date, y =economics$uempmed, type='scatter', mode='lines')
Sapply Example
a <- data.frame(product = c('牛排煎鍋', '牛排', '煎鍋' ),stringsAsFactors=FALSE)
a
## product
## 1 牛排煎鍋
## 2 牛排
## 3 煎鍋
b <- data.frame(words = c('煎鍋', '汽車'),stringsAsFactors=FALSE)
b$words
## [1] "煎鍋" "汽車"
filter_fun <- function(products){
s <- sum(sapply(b$words, function(e) grepl(pattern = e,x = products) )) >= 1
return(s)
}
a$product[! sapply(a$product, filter_fun)]
## [1] "牛排"
RPART
data(iris)
library(rpart)
# y = ax +b
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)
fit
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
summary(fit)
## Call:
## rpart(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length +
## Petal.Width, data = iris)
## n= 150
##
## CP nsplit rel error xerror xstd
## 1 0.50 0 1.00 1.18 0.05017303
## 2 0.44 1 0.50 0.78 0.06118823
## 3 0.01 2 0.06 0.10 0.03055050
##
## Variable importance
## Petal.Width Petal.Length Sepal.Length Sepal.Width
## 34 31 21 14
##
## Node number 1: 150 observations, complexity param=0.5
## predicted class=setosa expected loss=0.6666667 P(node) =1
## class counts: 50 50 50
## probabilities: 0.333 0.333 0.333
## left son=2 (50 obs) right son=3 (100 obs)
## Primary splits:
## Petal.Length < 2.45 to the left, improve=50.00000, (0 missing)
## Petal.Width < 0.8 to the left, improve=50.00000, (0 missing)
## Sepal.Length < 5.45 to the left, improve=34.16405, (0 missing)
## Sepal.Width < 3.35 to the right, improve=19.03851, (0 missing)
## Surrogate splits:
## Petal.Width < 0.8 to the left, agree=1.000, adj=1.00, (0 split)
## Sepal.Length < 5.45 to the left, agree=0.920, adj=0.76, (0 split)
## Sepal.Width < 3.35 to the right, agree=0.833, adj=0.50, (0 split)
##
## Node number 2: 50 observations
## predicted class=setosa expected loss=0 P(node) =0.3333333
## class counts: 50 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 3: 100 observations, complexity param=0.44
## predicted class=versicolor expected loss=0.5 P(node) =0.6666667
## class counts: 0 50 50
## probabilities: 0.000 0.500 0.500
## left son=6 (54 obs) right son=7 (46 obs)
## Primary splits:
## Petal.Width < 1.75 to the left, improve=38.969400, (0 missing)
## Petal.Length < 4.75 to the left, improve=37.353540, (0 missing)
## Sepal.Length < 6.15 to the left, improve=10.686870, (0 missing)
## Sepal.Width < 2.45 to the left, improve= 3.555556, (0 missing)
## Surrogate splits:
## Petal.Length < 4.75 to the left, agree=0.91, adj=0.804, (0 split)
## Sepal.Length < 6.15 to the left, agree=0.73, adj=0.413, (0 split)
## Sepal.Width < 2.95 to the left, agree=0.67, adj=0.283, (0 split)
##
## Node number 6: 54 observations
## predicted class=versicolor expected loss=0.09259259 P(node) =0.36
## class counts: 0 49 5
## probabilities: 0.000 0.907 0.093
##
## Node number 7: 46 observations
## predicted class=virginica expected loss=0.02173913 P(node) =0.3066667
## class counts: 0 1 45
## probabilities: 0.000 0.022 0.978
plot(fit, margin =0.1)
text(fit)

plot(Petal.Width ~ Petal.Length, data = iris, col=iris$Species)
abline(v = 2.45, col='blue')
abline(h = 1.75, col='orange')

predicted <- predict(fit, iris, type= 'class')
table(iris$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
#(50 + 49 + 45 ) / 150
#install.packages('caret')
#install.packages('e1071')
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
cm<-table(predict(fit, iris[,1:4], type="class"), iris[,5])
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.915, 0.9852)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9800 0.9000
## Specificity 1.0000 0.9500 0.9900
## Pos Pred Value 1.0000 0.9074 0.9783
## Neg Pred Value 1.0000 0.9896 0.9519
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3267 0.3000
## Detection Prevalence 0.3333 0.3600 0.3067
## Balanced Accuracy 1.0000 0.9650 0.9450
df <- data.frame(x = c(1,2,3), y= c(2,3,4), points = c('a','b', 'a'))
plot(y ~ x, data = df, col=c('red', 'blue', 'green'))

plot(y ~ x, data = df, col=c(1,2,1))

str(df)
## 'data.frame': 3 obs. of 3 variables:
## $ x : num 1 2 3
## $ y : num 2 3 4
## $ points: Factor w/ 2 levels "a","b": 1 2 1
plot(y ~ x, data = df, col=df$points)

set.seed(123)
sample.int(42,6)
## [1] 13 33 17 35 36 2
set.seed(123)
idx <- sample.int(2, nrow(iris), replace=TRUE, prob = c(0.7,0.3))
trainset <- iris[idx == 1,]
testset <- iris[idx == 2,]
dim(trainset)
## [1] 106 5
dim(testset)
## [1] 44 5
fit <- rpart(Species ~ ., data= iris)
plot(fit, margin = 0.1)
text(fit)

predicted <- predict(fit, testset, type= 'class')
tb <- table(testset$Species, predicted)
(15 + 13 +13) / (15 + 14 + 15)
## [1] 0.9318182
confusionMatrix(tb)
## Confusion Matrix and Statistics
##
## predicted
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 13 1
## virginica 0 2 13
##
## Overall Statistics
##
## Accuracy : 0.9318
## 95% CI : (0.8134, 0.9857)
## No Information Rate : 0.3409
## P-Value [Acc > NIR] : 2.711e-16
##
## Kappa : 0.8978
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.8667 0.9286
## Specificity 1.0000 0.9655 0.9333
## Pos Pred Value 1.0000 0.9286 0.8667
## Neg Pred Value 1.0000 0.9333 0.9655
## Prevalence 0.3409 0.3409 0.3182
## Detection Rate 0.3409 0.2955 0.2955
## Detection Prevalence 0.3409 0.3182 0.3409
## Balanced Accuracy 1.0000 0.9161 0.9310
Logistic Regression
dataset <- iris[iris$Species!= 'setosa', ]
dataset$Species <- factor(dataset$Species)
fit <- glm(Species ~., data = dataset, family='binomial')
summary(fit)
##
## Call:
## glm(formula = Species ~ ., family = "binomial", data = dataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.01105 -0.00541 -0.00001 0.00677 1.78065
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.638 25.707 -1.659 0.0972 .
## Sepal.Length -2.465 2.394 -1.030 0.3032
## Sepal.Width -6.681 4.480 -1.491 0.1359
## Petal.Length 9.429 4.737 1.991 0.0465 *
## Petal.Width 18.286 9.743 1.877 0.0605 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 138.629 on 99 degrees of freedom
## Residual deviance: 11.899 on 95 degrees of freedom
## AIC: 21.899
##
## Number of Fisher Scoring iterations: 10
predicted <- predict(fit, dataset, type= 'response')
summary(predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00e+00 1.48e-05 5.37e-01 5.00e-01 1.00e+00 1.00e+00
res <- factor(ifelse(predicted > 0.5, 'virginica', 'versicolor'))
table(dataset$Species, res)
## res
## versicolor virginica
## versicolor 49 1
## virginica 1 49
SVM
#install.packages('e1071')
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
fit <- rpart(Species ~., data = iris)
predicted <- predict(fit, iris, type = 'class')
table(iris$Species, predicted )
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
fit2 <- svm(Species ~., data = iris)
predicted2 <- predict(fit2, iris)
table(iris$Species, predicted2 )
## predicted2
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
?svm
## starting httpd help server ... done
Churn Analysis
library(C50)
## Warning: package 'C50' was built under R version 3.4.4
data(churn)
head(churnTrain)
## state account_length area_code international_plan voice_mail_plan
## 1 KS 128 area_code_415 no yes
## 2 OH 107 area_code_415 no yes
## 3 NJ 137 area_code_415 no no
## 4 OH 84 area_code_408 yes no
## 5 OK 75 area_code_415 yes no
## 6 AL 118 area_code_510 yes no
## number_vmail_messages total_day_minutes total_day_calls total_day_charge
## 1 25 265.1 110 45.07
## 2 26 161.6 123 27.47
## 3 0 243.4 114 41.38
## 4 0 299.4 71 50.90
## 5 0 166.7 113 28.34
## 6 0 223.4 98 37.98
## total_eve_minutes total_eve_calls total_eve_charge total_night_minutes
## 1 197.4 99 16.78 244.7
## 2 195.5 103 16.62 254.4
## 3 121.2 110 10.30 162.6
## 4 61.9 88 5.26 196.9
## 5 148.3 122 12.61 186.9
## 6 220.6 101 18.75 203.9
## total_night_calls total_night_charge total_intl_minutes total_intl_calls
## 1 91 11.01 10.0 3
## 2 103 11.45 13.7 3
## 3 104 7.32 12.2 5
## 4 89 8.86 6.6 7
## 5 121 8.41 10.1 3
## 6 118 9.18 6.3 6
## total_intl_charge number_customer_service_calls churn
## 1 2.70 1 no
## 2 3.70 1 no
## 3 3.29 0 no
## 4 1.78 2 no
## 5 2.73 3 no
## 6 1.70 0 no
churnTrain <- churnTrain[ , ! names(churnTrain) %in% c('state', 'account_length', 'area_code')]
dim(churnTrain)
## [1] 3333 17
set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace=TRUE, prob=c(0.7,0.3))
trainset <- churnTrain[idx == 1, ]
testset <- churnTrain[idx == 2, ]
library(rpart)
churn.rp <- rpart(churn ~ ., data = trainset)
predicted <- predict(churn.rp, testset, type = 'class')
table(testset$churn, predicted)
## predicted
## yes no
## yes 100 41
## no 18 859
plot(churn.rp, margin= 0.1)
text(churn.rp)

# kfold
idx <- sample.int(10, nrow(churnTrain), replace=TRUE)
for (i in 1:10){
trainset <- churnTrain[idx != i, ]
testset <- churnTrain[idx == i, ]
churn.rp <- rpart(churn ~., data = trainset)
predicted <- predict(churn.rp, testset, type='class')
#print(predicted)
print( sum(predicted == testset$churn ) / length(predicted))
}
## [1] 0.9650794
## [1] 0.9309392
## [1] 0.9439528
## [1] 0.9391026
## [1] 0.925
## [1] 0.9345794
## [1] 0.942623
## [1] 0.9589443
## [1] 0.9464883
## [1] 0.9441341
library(caret)
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
predicted <- predict(model, testset)
table(testset$churn, predicted)
## predicted
## yes no
## yes 23 26
## no 5 304
ROC Curve
fit <- rpart(churn ~., trainset)
predicted <- predict(fit, testset, type = 'prob')
#predicted
fpr <- c(0)
tpr <- c(0)
predicted[, 1]
## 3 32 36 44 46 53
## 0.11379310 0.11379310 0.02851426 0.02851426 0.02851426 0.02851426
## 72 82 113 124 127 130
## 0.11379310 0.04697987 0.92307692 0.02851426 0.88043478 0.02851426
## 154 157 160 181 201 208
## 0.11379310 0.84000000 0.02851426 0.09302326 0.02851426 0.02851426
## 212 219 225 235 238 239
## 0.04697987 0.73529412 0.02851426 0.04697987 0.02851426 0.02851426
## 241 258 260 261 270 280
## 0.02851426 0.02851426 0.02851426 0.02851426 0.04697987 0.04697987
## 291 302 315 321 325 332
## 0.11379310 0.84000000 0.02851426 0.02851426 0.02851426 0.84000000
## 341 365 370 387 388 394
## 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426
## 395 402 403 419 422 442
## 0.11379310 0.02851426 0.02851426 0.02851426 0.11379310 0.02851426
## 443 456 463 468 484 491
## 0.02851426 0.84000000 0.02851426 0.02851426 0.02851426 0.02851426
## 494 498 503 516 519 536
## 0.02851426 0.02851426 1.00000000 0.02851426 0.02851426 0.02851426
## 543 565 587 618 622 628
## 0.09302326 0.11379310 0.04697987 0.02851426 0.02851426 0.09302326
## 635 641 643 644 649 651
## 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426 0.11379310
## 657 663 667 669 682 684
## 0.02851426 0.02851426 0.11379310 0.11379310 0.02851426 0.02851426
## 695 705 742 743 744 751
## 0.88043478 0.02851426 0.02851426 0.09302326 0.02851426 0.02851426
## 755 760 774 781 792 793
## 0.02851426 0.02851426 1.00000000 0.84000000 0.02851426 0.10000000
## 802 804 829 831 836 841
## 0.11379310 0.02851426 0.02851426 1.00000000 0.02851426 0.02851426
## 863 874 892 911 916 919
## 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426
## 921 922 928 932 940 947
## 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426 0.11379310
## 950 985 992 995 997 1005
## 0.11379310 0.02851426 0.02851426 0.11379310 0.02851426 0.02851426
## 1015 1018 1035 1038 1043 1056
## 0.02851426 0.02851426 0.02851426 0.04697987 0.04697987 0.02851426
## 1064 1076 1080 1086 1108 1112
## 0.02851426 0.02851426 0.02851426 0.11379310 0.02851426 0.02851426
## 1115 1117 1138 1144 1149 1170
## 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426 0.04697987
## 1171 1173 1183 1193 1194 1215
## 0.02851426 0.02851426 0.84000000 0.09302326 0.88043478 0.02851426
## 1220 1221 1223 1241 1253 1259
## 0.02851426 0.02851426 0.02851426 0.92307692 0.11379310 0.04697987
## 1265 1272 1275 1297 1309 1310
## 0.02851426 0.04697987 0.02851426 0.02851426 0.02851426 0.04697987
## 1313 1328 1333 1334 1341 1367
## 0.02851426 0.02851426 0.11379310 0.02851426 0.02851426 0.02851426
## 1385 1389 1433 1498 1507 1510
## 0.02851426 0.09302326 0.02851426 0.02851426 0.11379310 0.11379310
## 1514 1516 1527 1530 1533 1551
## 0.11379310 0.02851426 0.11379310 0.02851426 0.02851426 0.02851426
## 1566 1587 1598 1599 1611 1612
## 0.02851426 0.02851426 0.02851426 0.02851426 0.00000000 0.02851426
## 1613 1665 1676 1682 1688 1691
## 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426
## 1702 1703 1711 1712 1717 1718
## 0.84000000 0.88043478 0.02851426 0.04697987 0.02851426 0.02851426
## 1721 1725 1729 1732 1746 1748
## 0.02851426 0.02851426 0.88043478 0.73529412 0.11379310 0.02851426
## 1793 1803 1809 1826 1841 1842
## 0.02851426 0.09302326 0.11379310 0.02851426 0.11379310 0.02851426
## 1846 1855 1861 1873 1901 1907
## 1.00000000 0.02851426 0.02851426 0.04697987 0.02851426 0.02851426
## 1913 1926 1928 1932 1937 1954
## 0.88043478 0.02851426 0.02851426 0.04697987 0.02851426 0.02851426
## 1958 1959 2000 2006 2008 2020
## 0.02851426 0.02851426 0.02851426 0.02851426 0.11379310 0.02851426
## 2022 2023 2039 2047 2074 2096
## 0.02851426 0.02851426 0.84000000 0.02851426 0.02851426 0.11379310
## 2108 2110 2114 2118 2120 2135
## 0.73529412 0.02851426 0.84000000 0.09302326 0.11379310 0.02851426
## 2138 2153 2201 2215 2220 2221
## 0.02851426 0.04697987 0.02851426 0.02851426 0.02851426 0.02851426
## 2229 2232 2238 2258 2267 2268
## 0.02851426 0.02851426 0.92307692 0.02851426 0.02851426 0.84000000
## 2286 2291 2307 2313 2315 2325
## 0.02851426 0.11379310 0.11379310 0.02851426 0.02851426 0.88043478
## 2327 2348 2352 2393 2412 2436
## 0.11379310 1.00000000 0.02851426 0.02851426 0.02851426 0.10000000
## 2444 2449 2454 2463 2464 2466
## 0.02851426 0.11379310 0.02851426 0.02851426 0.02851426 0.02851426
## 2468 2476 2481 2484 2491 2496
## 0.04697987 0.02851426 0.04697987 0.02851426 0.02851426 0.02851426
## 2509 2515 2523 2530 2531 2532
## 0.02851426 0.02851426 0.02851426 0.02851426 0.84000000 0.02851426
## 2536 2538 2541 2547 2556 2566
## 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426
## 2567 2581 2590 2607 2615 2628
## 0.02851426 0.02851426 0.02851426 0.02851426 0.84000000 0.02851426
## 2629 2648 2659 2665 2676 2727
## 0.09302326 0.84000000 0.11379310 1.00000000 0.02851426 0.02851426
## 2728 2753 2754 2756 2762 2764
## 0.04697987 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426
## 2767 2776 2777 2782 2785 2788
## 0.04697987 0.11379310 0.02851426 0.02851426 0.11379310 0.02851426
## 2794 2796 2797 2808 2832 2833
## 0.10000000 0.02851426 0.11379310 0.02851426 0.02851426 0.02851426
## 2840 2853 2854 2863 2878 2896
## 0.84000000 0.02851426 0.02851426 0.02851426 0.02851426 0.04697987
## 2902 2906 2908 2909 2917 2920
## 0.88043478 0.02851426 0.02851426 0.84000000 0.02851426 0.02851426
## 2924 2934 2936 2943 2959 2972
## 0.02851426 0.04697987 0.02851426 0.84000000 0.88043478 1.00000000
## 2975 3005 3018 3021 3055 3056
## 0.02851426 0.02851426 0.09302326 0.02851426 0.02851426 0.02851426
## 3067 3076 3081 3083 3101 3104
## 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426 0.02851426
## 3106 3110 3129 3145 3150 3154
## 0.02851426 0.04697987 0.02851426 0.88043478 0.02851426 0.02851426
## 3164 3184 3199 3209 3210 3231
## 0.73529412 0.02851426 0.02851426 0.11379310 1.00000000 0.02851426
## 3236 3237 3238 3257 3260 3273
## 0.02851426 0.11379310 0.04697987 0.84000000 0.02851426 0.84000000
## 3286 3322 3329 3330
## 0.02851426 0.02851426 0.02851426 0.11379310
for (i in seq(0,1,0.01)){
print(i)
predicted_prob <- factor(ifelse(predicted[, 1] > i, 'no', 'yes') )
#print(predicted_prob)
tb <- table(testset$churn, predicted_prob)
#print(length(tb))
if(length(tb) == 4){
TP <- tb[1,1]
FP <- tb[1,2]
FN <- tb[2,1]
TN <- tb[1,1]
tpr <- c(tpr, TP / (TP + FN))
fpr <- c(fpr, FP / (FP + TN))
}
}
## [1] 0
## [1] 0.01
## [1] 0.02
## [1] 0.03
## [1] 0.04
## [1] 0.05
## [1] 0.06
## [1] 0.07
## [1] 0.08
## [1] 0.09
## [1] 0.1
## [1] 0.11
## [1] 0.12
## [1] 0.13
## [1] 0.14
## [1] 0.15
## [1] 0.16
## [1] 0.17
## [1] 0.18
## [1] 0.19
## [1] 0.2
## [1] 0.21
## [1] 0.22
## [1] 0.23
## [1] 0.24
## [1] 0.25
## [1] 0.26
## [1] 0.27
## [1] 0.28
## [1] 0.29
## [1] 0.3
## [1] 0.31
## [1] 0.32
## [1] 0.33
## [1] 0.34
## [1] 0.35
## [1] 0.36
## [1] 0.37
## [1] 0.38
## [1] 0.39
## [1] 0.4
## [1] 0.41
## [1] 0.42
## [1] 0.43
## [1] 0.44
## [1] 0.45
## [1] 0.46
## [1] 0.47
## [1] 0.48
## [1] 0.49
## [1] 0.5
## [1] 0.51
## [1] 0.52
## [1] 0.53
## [1] 0.54
## [1] 0.55
## [1] 0.56
## [1] 0.57
## [1] 0.58
## [1] 0.59
## [1] 0.6
## [1] 0.61
## [1] 0.62
## [1] 0.63
## [1] 0.64
## [1] 0.65
## [1] 0.66
## [1] 0.67
## [1] 0.68
## [1] 0.69
## [1] 0.7
## [1] 0.71
## [1] 0.72
## [1] 0.73
## [1] 0.74
## [1] 0.75
## [1] 0.76
## [1] 0.77
## [1] 0.78
## [1] 0.79
## [1] 0.8
## [1] 0.81
## [1] 0.82
## [1] 0.83
## [1] 0.84
## [1] 0.85
## [1] 0.86
## [1] 0.87
## [1] 0.88
## [1] 0.89
## [1] 0.9
## [1] 0.91
## [1] 0.92
## [1] 0.93
## [1] 0.94
## [1] 0.95
## [1] 0.96
## [1] 0.97
## [1] 0.98
## [1] 0.99
## [1] 1
tpr <- c(tpr,1)
fpr <- c(fpr,1)
plot(fpr, tpr , type= 'l')

#install.packages('ROCR')
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.4
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.4
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions <-predict(churn.rp, testset, type="prob")
pred.to.roc<-predictions[, 1]
pred.rocr<-prediction(pred.to.roc, as.factor(testset[,(dim(testset)[[2]])]))
perf.rocr<-performance(pred.rocr, measure ="auc", x.measure="cutoff")
perf.tpr.rocr<-performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))
